library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.2.1
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(rpart)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(lift)
library(adabag)
## Warning: package 'adabag' was built under R version 4.2.1
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Warning: package 'doParallel' was built under R version 4.2.1
## Loading required package: iterators
## Loading required package: parallel
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(clusterSim)
## Warning: package 'clusterSim' was built under R version 4.2.1
## Loading required package: cluster
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
Load the Data
ebay = read.csv('C:/Users/Public/eBayAuctions.csv')
head(ebay)
## Category currency sellerRating Duration endDay ClosePrice OpenPrice
## 1 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## 2 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## 3 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## 4 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## 5 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## 6 Music/Movie/Game US 3249 5 Mon 0.01 0.01
## Competitive.
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Convert the Categorical data into Numeric data
ebay= ebay[,-1]
data_dummies = dummy_cols(ebay, remove_first_dummy = TRUE,
select_columns =colnames(select_if(ebay,
is.character)),remove_selected_columns = TRUE)
data_dummies$Competitive.= as.factor(data_dummies$Competitive.)
head(data_dummies)
## sellerRating Duration ClosePrice OpenPrice Competitive. currency_GBP
## 1 3249 5 0.01 0.01 0 0
## 2 3249 5 0.01 0.01 0 0
## 3 3249 5 0.01 0.01 0 0
## 4 3249 5 0.01 0.01 0 0
## 5 3249 5 0.01 0.01 0 0
## 6 3249 5 0.01 0.01 0 0
## currency_US endDay_Mon endDay_Sat endDay_Sun endDay_Thu endDay_Tue endDay_Wed
## 1 1 1 0 0 0 0 0
## 2 1 1 0 0 0 0 0
## 3 1 1 0 0 0 0 0
## 4 1 1 0 0 0 0 0
## 5 1 1 0 0 0 0 0
## 6 1 1 0 0 0 0 0
Splitting the data into train and test
n = nrow(data_dummies)
set.seed(101)
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = data_dummies[trainIndex ,]
test = data_dummies[-trainIndex ,]
fit = rpart(Competitive. ~., data = train)
printcp(fit)
##
## Classification tree:
## rpart(formula = Competitive. ~ ., data = train)
##
## Variables actually used in tree construction:
## [1] ClosePrice OpenPrice sellerRating
##
## Root node error: 645/1380 = 0.46739
##
## n= 1380
##
## CP nsplit rel error xerror xstd
## 1 0.243411 0 1.00000 1.00000 0.028736
## 2 0.112403 1 0.75659 0.75814 0.027548
## 3 0.083721 3 0.53178 0.54109 0.025035
## 4 0.043411 4 0.44806 0.45891 0.023641
## 5 0.041860 5 0.40465 0.43721 0.023223
## 6 0.010000 7 0.32093 0.33488 0.020927
fit.pruned = prune(fit, cp = 0.010000)
pred <- predict(fit.pruned, test[,-5],type = "class")
confusionMatrix(table(test$Competitive.,pred))
## Confusion Matrix and Statistics
##
## pred
## 0 1
## 0 237 24
## 1 68 263
##
## Accuracy : 0.8446
## 95% CI : (0.8129, 0.8728)
## No Information Rate : 0.5152
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6903
##
## Mcnemar's Test P-Value : 7.358e-06
##
## Sensitivity : 0.7770
## Specificity : 0.9164
## Pos Pred Value : 0.9080
## Neg Pred Value : 0.7946
## Prevalence : 0.5152
## Detection Rate : 0.4003
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.8467
##
## 'Positive' Class : 0
##
TopDecileLift(pred, test$Competitive.)
## [1] 1.789
ebay_boosting= boosting(Competitive. ~., data=train)
pred = predict(ebay_boosting, test)
print(pred$confusion)
## Observed Class
## Predicted Class 0 1
## 0 246 30
## 1 15 301
confusionMatrix(pred$confusion)
## Confusion Matrix and Statistics
##
## Observed Class
## Predicted Class 0 1
## 0 246 30
## 1 15 301
##
## Accuracy : 0.924
## 95% CI : (0.8996, 0.944)
## No Information Rate : 0.5591
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8467
##
## Mcnemar's Test P-Value : 0.03689
##
## Sensitivity : 0.9425
## Specificity : 0.9094
## Pos Pred Value : 0.8913
## Neg Pred Value : 0.9525
## Prevalence : 0.4409
## Detection Rate : 0.4155
## Detection Prevalence : 0.4662
## Balanced Accuracy : 0.9259
##
## 'Positive' Class : 0
##
TopDecileLift(as.numeric(pred$class), test$Competitive.)
## [1] 1.789
ebay_bagging <- bagging(Competitive.~., data=train)
pred = predict(ebay_bagging, test)
print(pred$confusion)
## Observed Class
## Predicted Class 0 1
## 0 243 62
## 1 18 269
confusionMatrix(pred$confusion)
## Confusion Matrix and Statistics
##
## Observed Class
## Predicted Class 0 1
## 0 243 62
## 1 18 269
##
## Accuracy : 0.8649
## 95% CI : (0.8347, 0.8914)
## No Information Rate : 0.5591
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7307
##
## Mcnemar's Test P-Value : 1.528e-06
##
## Sensitivity : 0.9310
## Specificity : 0.8127
## Pos Pred Value : 0.7967
## Neg Pred Value : 0.9373
## Prevalence : 0.4409
## Detection Rate : 0.4105
## Detection Prevalence : 0.5152
## Balanced Accuracy : 0.8719
##
## 'Positive' Class : 0
##
TopDecileLift(as.numeric(pred$class), test$Competitive.)
## [1] 1.789
ebay.rf <- randomForest(Competitive.~., data=train, mtry=4)
pred = predict(ebay.rf, test)
confusionMatrix(table(test$Competitive.,pred))
## Confusion Matrix and Statistics
##
## pred
## 0 1
## 0 240 21
## 1 44 287
##
## Accuracy : 0.8902
## 95% CI : (0.8622, 0.9142)
## No Information Rate : 0.5203
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7793
##
## Mcnemar's Test P-Value : 0.006357
##
## Sensitivity : 0.8451
## Specificity : 0.9318
## Pos Pred Value : 0.9195
## Neg Pred Value : 0.8671
## Prevalence : 0.4797
## Detection Rate : 0.4054
## Detection Prevalence : 0.4409
## Balanced Accuracy : 0.8884
##
## 'Positive' Class : 0
##
TopDecileLift(pred, test$Competitive.)
## [1] 1.789
flights = read.csv('C:/Users/Public/FlightDelays.csv')
head(flights)
## CRS_DEP_TIME CARRIER DEP_TIME DEST DISTANCE FL_DATE FL_NUM ORIGIN Weather
## 1 1455 OH 1455 JFK 184 01/01/2004 5935 BWI 0
## 2 1640 DH 1640 JFK 213 01/01/2004 6155 DCA 0
## 3 1245 DH 1245 LGA 229 01/01/2004 7208 IAD 0
## 4 1715 DH 1709 LGA 229 01/01/2004 7215 IAD 0
## 5 1039 DH 1035 LGA 229 01/01/2004 7792 IAD 0
## 6 840 DH 839 JFK 228 01/01/2004 7800 IAD 0
## DAY_WEEK DAY_OF_MONTH TAIL_NUM Flight.Status
## 1 4 1 N940CA ontime
## 2 4 1 N405FJ ontime
## 3 4 1 N695BR ontime
## 4 4 1 N662BR ontime
## 5 4 1 N698BR ontime
## 6 4 1 N687BR ontime
Transforming variable day of week info a categorical variable.
str(flights)
## 'data.frame': 2201 obs. of 13 variables:
## $ CRS_DEP_TIME : int 1455 1640 1245 1715 1039 840 1240 1645 1715 2120 ...
## $ CARRIER : chr "OH" "DH" "DH" "DH" ...
## $ DEP_TIME : int 1455 1640 1245 1709 1035 839 1243 1644 1710 2129 ...
## $ DEST : chr "JFK" "JFK" "LGA" "LGA" ...
## $ DISTANCE : int 184 213 229 229 229 228 228 228 228 228 ...
## $ FL_DATE : chr "01/01/2004" "01/01/2004" "01/01/2004" "01/01/2004" ...
## $ FL_NUM : int 5935 6155 7208 7215 7792 7800 7806 7810 7812 7814 ...
## $ ORIGIN : chr "BWI" "DCA" "IAD" "IAD" ...
## $ Weather : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DAY_WEEK : int 4 4 4 4 4 4 4 4 4 4 ...
## $ DAY_OF_MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ TAIL_NUM : chr "N940CA" "N405FJ" "N695BR" "N662BR" ...
## $ Flight.Status: chr "ontime" "ontime" "ontime" "ontime" ...
flights$DAY_WEEK= as.factor(flights$DAY_WEEK)
Used cut() to bin the scheduled departure time into eight bins.
flights$DEP_TIME=cut(flights$DEP_TIME, b = 8,labels = c(1,2,3,4,5,6,7,8))
Removing the unwanted data from the data and Converting the Output to Factor
flights = flights[,-c(6,7,12)]
flights$Flight.Status = ifelse(flights$Flight.Status== 'delayed',1,0)
Creating Dummies and Standardizing the data
flights_dummies = dummy_cols(flights, select_columns = c("CARRIER",
"DEST",'ORIGIN','DAY_WEEK'),
remove_first_dummy = TRUE,remove_selected_columns = TRUE)
flight_stand = data.Normalization(flights_dummies[,-
c(6)],type="n1",normalization="column")
## Warning in data.Normalization(flights_dummies[, -c(6)], type = "n1",
## normalization = "column"): Data not numeric, normalization not applicable
flight_stand$Flight.Status = as.factor(flights_dummies$Flight.Status)
#Splitting the data into train and test
n = nrow(flight_stand)
set.seed(101)
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = flight_stand[trainIndex ,]
test = flight_stand[-trainIndex ,]
Traning the CART model
fit = rpart(Flight.Status ~., data = train)
printcp(fit)
##
## Classification tree:
## rpart(formula = Flight.Status ~ ., data = train)
##
## Variables actually used in tree construction:
## [1] CRS_DEP_TIME DEP_TIME Weather
##
## Root node error: 300/1541 = 0.19468
##
## n= 1541
##
## CP nsplit rel error xerror xstd
## 1 0.073333 0 1.00000 1.00000 0.051811
## 2 0.036667 1 0.92667 0.91667 0.050103
## 3 0.010000 5 0.74667 0.75667 0.046375
fit.pruned = prune(fit, cp = 0.010000)
pred <- predict(fit.pruned, test,type = "class")
confusionMatrix(table(test$Flight.Status,pred))
## Confusion Matrix and Statistics
##
## pred
## 0 1
## 0 523 9
## 1 91 37
##
## Accuracy : 0.8485
## 95% CI : (0.8188, 0.875)
## No Information Rate : 0.9303
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3596
##
## Mcnemar's Test P-Value : 5.496e-16
##
## Sensitivity : 0.8518
## Specificity : 0.8043
## Pos Pred Value : 0.9831
## Neg Pred Value : 0.2891
## Prevalence : 0.9303
## Detection Rate : 0.7924
## Detection Prevalence : 0.8061
## Balanced Accuracy : 0.8281
##
## 'Positive' Class : 0
##
flights_boosting= boosting(Flight.Status~ ., data=train)
pred = predict(flights_boosting, test)
print(pred$confusion)
## Observed Class
## Predicted Class 0 1
## 0 503 75
## 1 29 53
confusionMatrix(pred$confusion)
## Confusion Matrix and Statistics
##
## Observed Class
## Predicted Class 0 1
## 0 503 75
## 1 29 53
##
## Accuracy : 0.8424
## 95% CI : (0.8123, 0.8694)
## No Information Rate : 0.8061
## P-Value [Acc > NIR] : 0.009091
##
## Kappa : 0.4164
##
## Mcnemar's Test P-Value : 1.021e-05
##
## Sensitivity : 0.9455
## Specificity : 0.4141
## Pos Pred Value : 0.8702
## Neg Pred Value : 0.6463
## Prevalence : 0.8061
## Detection Rate : 0.7621
## Detection Prevalence : 0.8758
## Balanced Accuracy : 0.6798
##
## 'Positive' Class : 0
##