library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.2
#I'm going to alias Weekly as W for profiling purposes#
W<-Weekly
Weekly is S&P 500 stock index data, with the first variable, Year, being the year the observation was recorded. The next 5 variables, Lag1:Lag5, represent the percentage return for the previous week, the return for 2 weeks previous, …, up to the return for 5 weeks previous.
Variable no. 7 is Volume, the average number of daily shares traded, in billions. Today, the 8th column, is the percentage return for the current week, and Direction, the final and 9th column, is a factor with levels Down and Up indicating whether the market had a negative or positive return in a given week.
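Before fitting anything, it's worth checking the dimensions and the class balance of Direction, since the No Information Rate reported in the confusion matrices below comes straight from that balance. A minimal sketch using base R:
dim(W)                          #1089 observations of 9 variables#
table(W$Direction)              #counts of Down vs Up weeks#
prop.table(table(W$Direction))  #Up weeks are slightly more common, roughly 56%#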
#Look at the structure#
str(W)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
#Peek at the dataset#
head(W)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
#Look at the correlations between numeric values#
corrplot::corrplot(cor(W[1:8]))
#Year and Volume stand out; let's take a closer look#
cor(W$Year,W$Volume)
## [1] 0.8419416
#It looks like they have a strong positive correlation#
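To see the relationship rather than just the number, here is a quick base-graphics sketch (the lowess smoother is just my choice for a rough trend line):
plot(W$Year, W$Volume, xlab = "Year", ylab = "Volume (billions of daily shares)")
lines(lowess(W$Year, W$Volume), col = "red") #trading volume grows steadily over the years#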
W.glm.fit<-glm(Direction~.-Year-Today, family = binomial, data = W)
#It looks like only Lag2, the return from 2 weeks previous, is significant, with a coefficient of 0.05844 and a p-value of 0.0296#
summary(W.glm.fit)
##
## Call:
## glm(formula = Direction ~ . - Year - Today, family = binomial,
## data = W)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
W.glm.fit<-glm(Direction~.-Year-Today, family = binomial, data = W)
W.probz <- predict(W.glm.fit,W, type = "response")
W.pred.glm <- rep("Down", length(W.probz))
W.pred.glm<-as.factor(ifelse(W.probz > 0.5,"Up","Down"))
caret::confusionMatrix(W.pred.glm, W$Direction)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.11157
## Specificity : 0.92066
## Pos Pred Value : 0.52941
## Neg Pred Value : 0.56434
## Prevalence : 0.44444
## Detection Rate : 0.04959
## Detection Prevalence : 0.09366
## Balanced Accuracy : 0.51612
##
## 'Positive' Class : Down
##
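Two quick numbers are worth pulling out of that matrix; a short sketch using the objects created above:
mean(W.pred.glm == W$Direction) #overall training accuracy, roughly 0.56#
mean(W.pred.glm == "Up")        #the model predicts Up for roughly 91% of weeks#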
#Here I make the training set from the years 1990 through 2008 and keep only the variables I need
train<-W[W$Year<=2008,c("Lag2","Direction")]
#And here I make the testing set from the years 2009 and 2010.
test<-W[W$Year>2008,c("Lag2","Direction")]
#Model
W.log.train<-glm(Direction~., data = train, family = binomial)
#Confusion Matrix
testpredz<-predict(W.log.train, test,type='response')
tp_dirk<-rep('Down', length(testpredz))
tp_dirk<-as.factor(ifelse(testpredz > 0.5,"Up","Down"))
caret::confusionMatrix(tp_dirk, test$Direction)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
library(MASS)
W.lda.train<-lda(Direction~., data = train)
testpredz<-predict(W.lda.train, test,type='response')
tp_dirk<-testpredz$class
caret::confusionMatrix(table(tp_dirk, Test.Direction = test$Direction))
## Confusion Matrix and Statistics
##
## Test.Direction
## tp_dirk Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.20930
## Specificity : 0.91803
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.62222
## Prevalence : 0.41346
## Detection Rate : 0.08654
## Detection Prevalence : 0.13462
## Balanced Accuracy : 0.56367
##
## 'Positive' Class : Down
##
W.qda.train<-qda(Direction~., data = train)
testpredz<-predict(W.qda.train, test,type='response')
tp_dirk<-testpredz$class
caret::confusionMatrix(table(tp_dirk, Test.Direction = test$Direction))
## Confusion Matrix and Statistics
##
## Test.Direction
## tp_dirk Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5865
## Prevalence : 0.4135
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Down
##
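QDA predicts Up for every test week, so its accuracy lands exactly on the no-information rate. That is less surprising after checking the class balance in the training window, which drives the prior QDA uses; a quick check:
table(train$Direction)             #Up weeks outnumber Down weeks in 1990-2008#
prop.table(table(train$Direction))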
library(class)
#Get the x variable for train, and y variable for train
train.x<-as.matrix(W[W$Year<=2008,]$Lag2)
train.y<-W[W$Year<=2008,]$Direction
#Get the x variable for test, and y variable for test
test.x<-as.matrix(W[W$Year>2008,]$Lag2)
test.y<-W[W$Year>2008,]$Direction
knn.pred<-knn(train.x,test.x,train.y, k=1)
caret::confusionMatrix(knn.pred,test.y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 29
## Up 22 32
##
## Accuracy : 0.5096
## 95% CI : (0.4097, 0.609)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9540
##
## Kappa : 0.0127
## Mcnemar's Test P-Value : 0.4008
##
## Sensitivity : 0.4884
## Specificity : 0.5246
## Pos Pred Value : 0.4200
## Neg Pred Value : 0.5926
## Prevalence : 0.4135
## Detection Rate : 0.2019
## Detection Prevalence : 0.4808
## Balanced Accuracy : 0.5065
##
## 'Positive' Class : Down
##
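With k = 1, KNN is barely better than a coin flip on this split. A small sketch of how a few other values of k could be scanned (the grid is arbitrary, and knn breaks distance ties at random, so exact numbers can wobble):
set.seed(1)
for (k in c(1, 3, 5, 10, 25)) {
  pred <- class::knn(train.x, test.x, train.y, k = k)
  cat("k =", k, "accuracy =", round(mean(pred == test.y), 3), "\n")
}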
a<-Auto
a$mpg01<-ifelse(a$mpg>median(a$mpg),1,0)
cor(a[,-9],a$mpg01)
## [,1]
## mpg 0.8369392
## cylinders -0.7591939
## displacement -0.7534766
## horsepower -0.6670526
## weight -0.7577566
## acceleration 0.3468215
## year 0.4299042
## origin 0.5136984
## mpg01 1.0000000
corrplot::corrplot(cor(a[,-9],a$mpg01))
library(car)
## Warning: package 'car' was built under R version 3.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.5.2
library(lattice)
super.sym <- trellis.par.get("superpose.symbol")
#group 0 (under the mpg median) is plotted first, so the first key symbol and label belong to the under-median group#
splom(a[c(-1,-9,-10)], groups = a$mpg01, panel = panel.superpose,
      key = list(title = "Over/Under MPG Median", columns = 2,
                 points = list(pch = super.sym$pch[1:2], col = super.sym$col[1:2]),
                 text = list(c('Under MPG Median','Over MPG Median'))))
a$origin<-as.factor(a$origin)
a$mpg01<-as.factor(a$mpg01)
set.seed(2020)
train<-sample(nrow(a),nrow(a)*.8)
a.train<-a[train,c(-1,-9)]
a.test<-a[-train,c(-1,-9)]
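Since mpg01 is a median split, the two classes are close to balanced overall, but it's worth confirming the random 80/20 split kept them that way, because the test-set balance sets the No Information Rate in the confusion matrices below. A quick check:
table(a.train$mpg01) #class counts in the training set#
table(a.test$mpg01)  #class counts in the test set#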
#I'm going to use cylinders, displacement, horsepower, weight, acceleration, year, and origin as the independent variables; mpg01 is the dependent variable
a.lda.train<-lda(mpg01~., data = a.train)
testpredz<-predict(a.lda.train, a.test,type='response')
tp_mpg<-testpredz$class
caret::confusionMatrix(tp_mpg, a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 3
## 1 6 37
##
## Accuracy : 0.8861
## 95% CI : (0.7947, 0.9466)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 8.396e-13
##
## Kappa : 0.7719
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.8462
## Specificity : 0.9250
## Pos Pred Value : 0.9167
## Neg Pred Value : 0.8605
## Prevalence : 0.4937
## Detection Rate : 0.4177
## Detection Prevalence : 0.4557
## Balanced Accuracy : 0.8856
##
## 'Positive' Class : 0
##
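The quantity of interest here is the test error rate, which can be read off the matrix or computed directly from the predictions above:
mean(tp_mpg != a.test$mpg01) #LDA test error rate, roughly 0.11#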
a.qda.train<-qda(mpg01~., data = a.train)
confit<-function(model,test_ds, test_ds_testvar){
testpredz<-predict(model, test_ds,type='response')
caret::confusionMatrix(testpredz$class, test_ds_testvar)
}
confit(a.qda.train,a.test,a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 3
## 1 4 37
##
## Accuracy : 0.9114
## 95% CI : (0.8259, 0.9636)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 1.201e-14
##
## Kappa : 0.8227
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8974
## Specificity : 0.9250
## Pos Pred Value : 0.9211
## Neg Pred Value : 0.9024
## Prevalence : 0.4937
## Detection Rate : 0.4430
## Detection Prevalence : 0.4810
## Balanced Accuracy : 0.9112
##
## 'Positive' Class : 0
##
a.log.train<-glm(mpg01~., data = a.train, family = 'binomial')
testpredz<-predict(a.log.train, a.test,type='response')
a.pred.glm<-as.factor(ifelse(testpredz>.5,1,0))
caret::confusionMatrix(a.pred.glm, a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 6
## 1 4 34
##
## Accuracy : 0.8734
## 95% CI : (0.7795, 0.9376)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 5.838e-12
##
## Kappa : 0.747
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.8974
## Specificity : 0.8500
## Pos Pred Value : 0.8537
## Neg Pred Value : 0.8947
## Prevalence : 0.4937
## Detection Rate : 0.4430
## Detection Prevalence : 0.5190
## Balanced Accuracy : 0.8737
##
## 'Positive' Class : 0
##
train.y<-(a.train[,8])
train.x<-(a.train[,-8])
test.x<-(a.test[,-8])
test.y<-(a.test[,8])
knn.pred<-class::knn(train.x,test.x,train.y, k=1)
caret::confusionMatrix(knn.pred,a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 7
## 1 6 33
##
## Accuracy : 0.8354
## 95% CI : (0.7351, 0.9094)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 1.054e-09
##
## Kappa : 0.6709
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8462
## Specificity : 0.8250
## Pos Pred Value : 0.8250
## Neg Pred Value : 0.8462
## Prevalence : 0.4937
## Detection Rate : 0.4177
## Detection Prevalence : 0.5063
## Balanced Accuracy : 0.8356
##
## 'Positive' Class : 0
##
knn.pred<-class::knn(train.x,test.x,train.y, k=2)
caret::confusionMatrix(knn.pred,a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 9
## 1 6 31
##
## Accuracy : 0.8101
## 95% CI : (0.7062, 0.8897)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 2.144e-08
##
## Kappa : 0.6206
## Mcnemar's Test P-Value : 0.6056
##
## Sensitivity : 0.8462
## Specificity : 0.7750
## Pos Pred Value : 0.7857
## Neg Pred Value : 0.8378
## Prevalence : 0.4937
## Detection Rate : 0.4177
## Detection Prevalence : 0.5316
## Balanced Accuracy : 0.8106
##
## 'Positive' Class : 0
##
knn.pred<-class::knn(train.x,test.x,train.y, k=5)
caret::confusionMatrix(knn.pred,a.test$mpg01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34 8
## 1 5 32
##
## Accuracy : 0.8354
## 95% CI : (0.7351, 0.9094)
## No Information Rate : 0.5063
## P-Value [Acc > NIR] : 1.054e-09
##
## Kappa : 0.6711
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.8718
## Specificity : 0.8000
## Pos Pred Value : 0.8095
## Neg Pred Value : 0.8649
## Prevalence : 0.4937
## Detection Rate : 0.4304
## Detection Prevalence : 0.5316
## Balanced Accuracy : 0.8359
##
## 'Positive' Class : 0
##
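Of the values tried, k = 1 and k = 5 tie at about 0.835 accuracy, with k = 2 slightly worse. A more systematic sketch would scan a wider grid on this same split (again, knn's random tie-breaking means the exact numbers can shift a little):
set.seed(2020)
k.grid <- c(1, 3, 5, 7, 9, 15, 25)
acc <- sapply(k.grid, function(k) mean(class::knn(train.x, test.x, train.y, k = k) == test.y))
data.frame(k = k.grid, accuracy = round(acc, 3))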
crim - per capita crime rate by town.
zn - proportion of residential land zoned for lots over 25,000 sq.ft.
indus - proportion of non-retail business acres per town.
chas - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox - nitrogen oxides concentration (parts per 10 million).
rm - average number of rooms per dwelling.
age - proportion of owner-occupied units built prior to 1940.
dis - weighted mean of distances to five Boston employment centres.
rad - index of accessibility to radial highways.
tax - full-value property-tax rate per $10,000.
ptratio - pupil-teacher ratio by town.
black - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
lstat - lower status of the population (percent).
medv - median value of owner-occupied homes in $1000s.
b<-Boston
b$crim01<-ifelse(b$crim>median(b$crim),1,0)
cor(b[,-15],b$crim01)
## [,1]
## crim 0.40939545
## zn -0.43615103
## indus 0.60326017
## chas 0.07009677
## nox 0.72323480
## rm -0.15637178
## age 0.61393992
## dis -0.61634164
## rad 0.61978625
## tax 0.60874128
## ptratio 0.25356836
## black -0.35121093
## lstat 0.45326273
## medv -0.26301673
b$crim01<-as.factor(ifelse(b$crim>median(b$crim),1,0))
#The variable sets go from all predictors, to those with an absolute correlation with crim01 above .4, to those with an absolute correlation above .6#
var<-1 #column index of crim; used as -var to drop it and keep every other variable#
var1<-c('zn','indus','nox','age','dis','rad','tax','lstat','crim01')
var2<-c('indus','nox','age','dis','rad','tax','crim01')
#Making samples#
set.seed(111)
train<-sample(nrow(b),nrow(b)*.8)
train1<-sample(nrow(b),nrow(b)*.7)
train2<-sample(nrow(b),nrow(b)*.825)
#First set of Var#
b.train<-b[train,-var]
b.test<-b[-train,-var]
b.train01<-b[train1,-var]
b.test01<-b[-train1,-var]
b.train02<-b[train2,-var]
b.test02<-b[-train2,-var]
#Second set of Var#
b.train1<-b[train1,var1]
b.test1<-b[-train1,var1]
b.train10<-b[train,var1]
b.test10<-b[-train,var1]
b.train12<-b[train2,var1]
b.test12<-b[-train2,var1]
#Third set of Var#
b.train2<-b[train2,var2]
b.test2<-b[-train2,var2]
b.train20<-b[train,var2]
b.test20<-b[-train,var2]
b.train21<-b[train1,var2]
b.test21<-b[-train1,var2]
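The nine train/test pairs above all follow the same pattern, so the bookkeeping could be compressed; here is a small sketch of the same idea with a helper function (make_split is a name introduced only for this illustration):
make_split <- function(rows, cols) {
  #one train/test pair for a given row sample and column selection#
  list(train = b[rows, cols], test = b[-rows, cols])
}
#For example, the equivalent of b.train12 / b.test12:#
s12 <- make_split(train2, var1)
dim(s12$train); dim(s12$test)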
b.lda.train<-lda(crim01 ~ ., data = b.train)
confit(b.lda.train,b.test,b.test$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 46 13
## 1 1 42
##
## Accuracy : 0.8627
## 95% CI : (0.7804, 0.9229)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 3.507e-12
##
## Kappa : 0.7288
## Mcnemar's Test P-Value : 0.003283
##
## Sensitivity : 0.9787
## Specificity : 0.7636
## Pos Pred Value : 0.7797
## Neg Pred Value : 0.9767
## Prevalence : 0.4608
## Detection Rate : 0.4510
## Detection Prevalence : 0.5784
## Balanced Accuracy : 0.8712
##
## 'Positive' Class : 0
##
b.lda.train01<-lda(crim01 ~ ., data = b.train01)
confit(b.lda.train01,b.test01,b.test01$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 20
## 1 2 60
##
## Accuracy : 0.8553
## 95% CI : (0.7891, 0.907)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7133
## Mcnemar's Test P-Value : 0.0002896
##
## Sensitivity : 0.9722
## Specificity : 0.7500
## Pos Pred Value : 0.7778
## Neg Pred Value : 0.9677
## Prevalence : 0.4737
## Detection Rate : 0.4605
## Detection Prevalence : 0.5921
## Balanced Accuracy : 0.8611
##
## 'Positive' Class : 0
##
b.lda.train02<-lda(crim01 ~ ., data = b.train02)
confit(b.lda.train02,b.test02,b.test02$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 9
## 1 5 42
##
## Accuracy : 0.8427
## 95% CI : (0.7502, 0.9112)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 4.914e-08
##
## Kappa : 0.6828
## Mcnemar's Test P-Value : 0.4227
##
## Sensitivity : 0.8684
## Specificity : 0.8235
## Pos Pred Value : 0.7857
## Neg Pred Value : 0.8936
## Prevalence : 0.4270
## Detection Rate : 0.3708
## Detection Prevalence : 0.4719
## Balanced Accuracy : 0.8460
##
## 'Positive' Class : 0
##
b.lda.train1<-lda(crim01 ~ ., data = b.train1)
confit(b.lda.train1,b.test1,b.test1$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 68 20
## 1 4 60
##
## Accuracy : 0.8421
## 95% CI : (0.7742, 0.8961)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 2.34e-16
##
## Kappa : 0.6868
## Mcnemar's Test P-Value : 0.0022
##
## Sensitivity : 0.9444
## Specificity : 0.7500
## Pos Pred Value : 0.7727
## Neg Pred Value : 0.9375
## Prevalence : 0.4737
## Detection Rate : 0.4474
## Detection Prevalence : 0.5789
## Balanced Accuracy : 0.8472
##
## 'Positive' Class : 0
##
b.lda.train10<-lda(crim01 ~ ., data = b.train10)
confit(b.lda.train10,b.test10,b.test10$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 46 13
## 1 1 42
##
## Accuracy : 0.8627
## 95% CI : (0.7804, 0.9229)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 3.507e-12
##
## Kappa : 0.7288
## Mcnemar's Test P-Value : 0.003283
##
## Sensitivity : 0.9787
## Specificity : 0.7636
## Pos Pred Value : 0.7797
## Neg Pred Value : 0.9767
## Prevalence : 0.4608
## Detection Rate : 0.4510
## Detection Prevalence : 0.5784
## Balanced Accuracy : 0.8712
##
## 'Positive' Class : 0
##
b.lda.train12<-lda(crim01 ~ ., data = b.train12)
confit(b.lda.train12,b.test12,b.test12$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 12
## 1 7 39
##
## Accuracy : 0.7865
## 95% CI : (0.6869, 0.8663)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 1.948e-05
##
## Kappa : 0.5709
## Mcnemar's Test P-Value : 0.3588
##
## Sensitivity : 0.8158
## Specificity : 0.7647
## Pos Pred Value : 0.7209
## Neg Pred Value : 0.8478
## Prevalence : 0.4270
## Detection Rate : 0.3483
## Detection Prevalence : 0.4831
## Balanced Accuracy : 0.7902
##
## 'Positive' Class : 0
##
b.lda.train2<-lda(crim01 ~ ., data = b.train2)
confit(b.lda.train2,b.test2,b.test2$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 32 12
## 1 6 39
##
## Accuracy : 0.7978
## 95% CI : (0.6993, 0.8755)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 6.765e-06
##
## Kappa : 0.5948
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.8421
## Specificity : 0.7647
## Pos Pred Value : 0.7273
## Neg Pred Value : 0.8667
## Prevalence : 0.4270
## Detection Rate : 0.3596
## Detection Prevalence : 0.4944
## Balanced Accuracy : 0.8034
##
## 'Positive' Class : 0
##
b.lda.train20<-lda(crim01 ~ ., data = b.train20)
confit(b.lda.train20,b.test20,b.test20$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 12
## 1 2 43
##
## Accuracy : 0.8627
## 95% CI : (0.7804, 0.9229)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 3.507e-12
##
## Kappa : 0.728
## Mcnemar's Test P-Value : 0.01616
##
## Sensitivity : 0.9574
## Specificity : 0.7818
## Pos Pred Value : 0.7895
## Neg Pred Value : 0.9556
## Prevalence : 0.4608
## Detection Rate : 0.4412
## Detection Prevalence : 0.5588
## Balanced Accuracy : 0.8696
##
## 'Positive' Class : 0
##
b.lda.train21<-lda(crim01 ~ ., data = b.train21)
confit(b.lda.train21,b.test21,b.test21$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 20
## 1 3 60
##
## Accuracy : 0.8487
## 95% CI : (0.7817, 0.9016)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7001
## Mcnemar's Test P-Value : 0.0008492
##
## Sensitivity : 0.9583
## Specificity : 0.7500
## Pos Pred Value : 0.7753
## Neg Pred Value : 0.9524
## Prevalence : 0.4737
## Detection Rate : 0.4539
## Detection Prevalence : 0.5855
## Balanced Accuracy : 0.8542
##
## 'Positive' Class : 0
##
b.qda.train<-qda(crim01 ~ ., data = b.train)
confit(b.qda.train,b.test,b.test$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 46 11
## 1 1 44
##
## Accuracy : 0.8824
## 95% CI : (0.8035, 0.9377)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 1.054e-13
##
## Kappa : 0.7669
## Mcnemar's Test P-Value : 0.009375
##
## Sensitivity : 0.9787
## Specificity : 0.8000
## Pos Pred Value : 0.8070
## Neg Pred Value : 0.9778
## Prevalence : 0.4608
## Detection Rate : 0.4510
## Detection Prevalence : 0.5588
## Balanced Accuracy : 0.8894
##
## 'Positive' Class : 0
##
b.qda.train01<-qda(crim01 ~ ., data = b.train01)
confit(b.qda.train01,b.test01,b.test01$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 71 14
## 1 1 66
##
## Accuracy : 0.9013
## 95% CI : (0.8425, 0.9437)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8039
## Mcnemar's Test P-Value : 0.001946
##
## Sensitivity : 0.9861
## Specificity : 0.8250
## Pos Pred Value : 0.8353
## Neg Pred Value : 0.9851
## Prevalence : 0.4737
## Detection Rate : 0.4671
## Detection Prevalence : 0.5592
## Balanced Accuracy : 0.9056
##
## 'Positive' Class : 0
##
b.qda.train02<-qda(crim01 ~ ., data = b.train02)
confit(b.qda.train02,b.test02,b.test02$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 8
## 1 1 43
##
## Accuracy : 0.8989
## 95% CI : (0.8167, 0.9527)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 1.584e-11
##
## Kappa : 0.7981
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9737
## Specificity : 0.8431
## Pos Pred Value : 0.8222
## Neg Pred Value : 0.9773
## Prevalence : 0.4270
## Detection Rate : 0.4157
## Detection Prevalence : 0.5056
## Balanced Accuracy : 0.9084
##
## 'Positive' Class : 0
##
b.qda.train1<-qda(crim01 ~ ., data = b.train1)
confit(b.qda.train1,b.test1,b.test1$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 11
## 1 3 69
##
## Accuracy : 0.9079
## 95% CI : (0.8503, 0.9487)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8163
## Mcnemar's Test P-Value : 0.06137
##
## Sensitivity : 0.9583
## Specificity : 0.8625
## Pos Pred Value : 0.8625
## Neg Pred Value : 0.9583
## Prevalence : 0.4737
## Detection Rate : 0.4539
## Detection Prevalence : 0.5263
## Balanced Accuracy : 0.9104
##
## 'Positive' Class : 0
##
b.qda.train10<-qda(crim01 ~ ., data = b.train10)
confit(b.qda.train10,b.test10,b.test10$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 6
## 1 0 49
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8827
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 1.0000
## Specificity : 0.8909
## Pos Pred Value : 0.8868
## Neg Pred Value : 1.0000
## Prevalence : 0.4608
## Detection Rate : 0.4608
## Detection Prevalence : 0.5196
## Balanced Accuracy : 0.9455
##
## 'Positive' Class : 0
##
b.qda.train12<-qda(crim01 ~ ., data = b.train12)
confit(b.qda.train12,b.test12,b.test12$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 7
## 1 3 44
##
## Accuracy : 0.8876
## 95% CI : (0.8031, 0.9448)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 9.646e-11
##
## Kappa : 0.7734
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.9211
## Specificity : 0.8627
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9362
## Prevalence : 0.4270
## Detection Rate : 0.3933
## Detection Prevalence : 0.4719
## Balanced Accuracy : 0.8919
##
## 'Positive' Class : 0
##
b.qda.train2<-qda(crim01 ~ ., data = b.train2)
confit(b.qda.train2,b.test2,b.test2$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 7
## 1 3 44
##
## Accuracy : 0.8876
## 95% CI : (0.8031, 0.9448)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 9.646e-11
##
## Kappa : 0.7734
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.9211
## Specificity : 0.8627
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.9362
## Prevalence : 0.4270
## Detection Rate : 0.3933
## Detection Prevalence : 0.4719
## Balanced Accuracy : 0.8919
##
## 'Positive' Class : 0
##
b.qda.train20<-qda(crim01 ~ ., data = b.train20)
confit(b.qda.train20,b.test20,b.test20$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 9
## 1 0 46
##
## Accuracy : 0.9118
## 95% CI : (0.8391, 0.9589)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 2.73e-16
##
## Kappa : 0.8249
## Mcnemar's Test P-Value : 0.007661
##
## Sensitivity : 1.0000
## Specificity : 0.8364
## Pos Pred Value : 0.8393
## Neg Pred Value : 1.0000
## Prevalence : 0.4608
## Detection Rate : 0.4608
## Detection Prevalence : 0.5490
## Balanced Accuracy : 0.9182
##
## 'Positive' Class : 0
##
b.qda.train21<-qda(crim01 ~ ., data = b.train21)
confit(b.qda.train21,b.test21,b.test21$crim01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 71 14
## 1 1 66
##
## Accuracy : 0.9013
## 95% CI : (0.8425, 0.9437)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8039
## Mcnemar's Test P-Value : 0.001946
##
## Sensitivity : 0.9861
## Specificity : 0.8250
## Pos Pred Value : 0.8353
## Neg Pred Value : 0.9851
## Prevalence : 0.4737
## Detection Rate : 0.4671
## Detection Prevalence : 0.5592
## Balanced Accuracy : 0.9056
##
## 'Positive' Class : 0
##
b.log.confit<-function(train, test){
b.log.train<-glm(crim01~., data = train, family = 'binomial')
testpredz<-predict(b.log.train, test,type='response')
b.pred.glm<-as.factor(ifelse(testpredz>.5,1,0))
caret::confusionMatrix(b.pred.glm, test$crim01)
}
b.log.confit(b.train,b.test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 2
## 1 0 53
##
## Accuracy : 0.9804
## 95% CI : (0.931, 0.9976)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9607
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.9636
## Pos Pred Value : 0.9592
## Neg Pred Value : 1.0000
## Prevalence : 0.4608
## Detection Rate : 0.4608
## Detection Prevalence : 0.4804
## Balanced Accuracy : 0.9818
##
## 'Positive' Class : 0
##
b.log.confit(b.train01,b.test01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 7
## 1 2 73
##
## Accuracy : 0.9408
## 95% CI : (0.8906, 0.9726)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8817
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9722
## Specificity : 0.9125
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.9733
## Prevalence : 0.4737
## Detection Rate : 0.4605
## Detection Prevalence : 0.5066
## Balanced Accuracy : 0.9424
##
## 'Positive' Class : 0
##
b.log.confit(b.train02,b.test02)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 6
## 1 5 45
##
## Accuracy : 0.8764
## 95% CI : (0.7896, 0.9367)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 5.279e-10
##
## Kappa : 0.7483
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8684
## Specificity : 0.8824
## Pos Pred Value : 0.8462
## Neg Pred Value : 0.9000
## Prevalence : 0.4270
## Detection Rate : 0.3708
## Detection Prevalence : 0.4382
## Balanced Accuracy : 0.8754
##
## 'Positive' Class : 0
##
b.log.confit(b.train1,b.test1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 67 9
## 1 5 71
##
## Accuracy : 0.9079
## 95% CI : (0.8503, 0.9487)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8158
## Mcnemar's Test P-Value : 0.4227
##
## Sensitivity : 0.9306
## Specificity : 0.8875
## Pos Pred Value : 0.8816
## Neg Pred Value : 0.9342
## Prevalence : 0.4737
## Detection Rate : 0.4408
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9090
##
## 'Positive' Class : 0
##
b.log.confit(b.train10,b.test10)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 39 3
## 1 8 52
##
## Accuracy : 0.8922
## 95% CI : (0.8152, 0.9449)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 1.6e-14
##
## Kappa : 0.7813
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.8298
## Specificity : 0.9455
## Pos Pred Value : 0.9286
## Neg Pred Value : 0.8667
## Prevalence : 0.4608
## Detection Rate : 0.3824
## Detection Prevalence : 0.4118
## Balanced Accuracy : 0.8876
##
## 'Positive' Class : 0
##
b.log.confit(b.train12,b.test12)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 29 8
## 1 9 43
##
## Accuracy : 0.809
## 95% CI : (0.7119, 0.8846)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 2.199e-06
##
## Kappa : 0.6083
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7632
## Specificity : 0.8431
## Pos Pred Value : 0.7838
## Neg Pred Value : 0.8269
## Prevalence : 0.4270
## Detection Rate : 0.3258
## Detection Prevalence : 0.4157
## Balanced Accuracy : 0.8031
##
## 'Positive' Class : 0
##
b.log.confit(b.train2,b.test2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 6
## 1 7 45
##
## Accuracy : 0.8539
## 95% CI : (0.7632, 0.9199)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 1.184e-08
##
## Kappa : 0.7005
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8158
## Specificity : 0.8824
## Pos Pred Value : 0.8378
## Neg Pred Value : 0.8654
## Prevalence : 0.4270
## Detection Rate : 0.3483
## Detection Prevalence : 0.4157
## Balanced Accuracy : 0.8491
##
## 'Positive' Class : 0
##
b.log.confit(b.train20,b.test20)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 40 5
## 1 7 50
##
## Accuracy : 0.8824
## 95% CI : (0.8035, 0.9377)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : 1.054e-13
##
## Kappa : 0.7625
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.8511
## Specificity : 0.9091
## Pos Pred Value : 0.8889
## Neg Pred Value : 0.8772
## Prevalence : 0.4608
## Detection Rate : 0.3922
## Detection Prevalence : 0.4412
## Balanced Accuracy : 0.8801
##
## 'Positive' Class : 0
##
b.log.confit(b.train21,b.test21)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 68 14
## 1 4 66
##
## Accuracy : 0.8816
## 95% CI : (0.8193, 0.9283)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7641
## Mcnemar's Test P-Value : 0.03389
##
## Sensitivity : 0.9444
## Specificity : 0.8250
## Pos Pred Value : 0.8293
## Neg Pred Value : 0.9429
## Prevalence : 0.4737
## Detection Rate : 0.4474
## Detection Prevalence : 0.5395
## Balanced Accuracy : 0.8847
##
## 'Positive' Class : 0
##
b.knn.confit<-function(train,test){
train.y<-(train[,"crim01"])
train.x<-(train[,names(train)!="crim01"])
test.x<-(test[,names(test)!="crim01"])
test.y<-(test[,"crim01"])
knn.pred<-class::knn(train.x,test.x,train.y, k=1)
caret::confusionMatrix(knn.pred,test$crim01)
}
b.knn.confit(b.train,b.test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 43 2
## 1 4 53
##
## Accuracy : 0.9412
## 95% CI : (0.8764, 0.9781)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8813
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9149
## Specificity : 0.9636
## Pos Pred Value : 0.9556
## Neg Pred Value : 0.9298
## Prevalence : 0.4608
## Detection Rate : 0.4216
## Detection Prevalence : 0.4412
## Balanced Accuracy : 0.9393
##
## 'Positive' Class : 0
##
b.knn.confit(b.train01,b.test01)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 70 6
## 1 2 74
##
## Accuracy : 0.9474
## 95% CI : (0.8989, 0.977)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8947
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.9722
## Specificity : 0.9250
## Pos Pred Value : 0.9211
## Neg Pred Value : 0.9737
## Prevalence : 0.4737
## Detection Rate : 0.4605
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9486
##
## 'Positive' Class : 0
##
b.knn.confit(b.train02,b.test02)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 32 6
## 1 6 45
##
## Accuracy : 0.8652
## 95% CI : (0.7763, 0.9283)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 2.617e-09
##
## Kappa : 0.7245
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8421
## Specificity : 0.8824
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.8824
## Prevalence : 0.4270
## Detection Rate : 0.3596
## Detection Prevalence : 0.4270
## Balanced Accuracy : 0.8622
##
## 'Positive' Class : 0
##
b.knn.confit(b.train1,b.test1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 67 4
## 1 5 76
##
## Accuracy : 0.9408
## 95% CI : (0.8906, 0.9726)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8812
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9306
## Specificity : 0.9500
## Pos Pred Value : 0.9437
## Neg Pred Value : 0.9383
## Prevalence : 0.4737
## Detection Rate : 0.4408
## Detection Prevalence : 0.4671
## Balanced Accuracy : 0.9403
##
## 'Positive' Class : 0
##
b.knn.confit(b.train10,b.test10)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 1
## 1 2 54
##
## Accuracy : 0.9706
## 95% CI : (0.9164, 0.9939)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9407
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9574
## Specificity : 0.9818
## Pos Pred Value : 0.9783
## Neg Pred Value : 0.9643
## Prevalence : 0.4608
## Detection Rate : 0.4412
## Detection Prevalence : 0.4510
## Balanced Accuracy : 0.9696
##
## 'Positive' Class : 0
##
b.knn.confit(b.train12,b.test12)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 4
## 1 7 47
##
## Accuracy : 0.8764
## 95% CI : (0.7896, 0.9367)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 5.279e-10
##
## Kappa : 0.7449
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.8158
## Specificity : 0.9216
## Pos Pred Value : 0.8857
## Neg Pred Value : 0.8704
## Prevalence : 0.4270
## Detection Rate : 0.3483
## Detection Prevalence : 0.3933
## Balanced Accuracy : 0.8687
##
## 'Positive' Class : 0
##
b.knn.confit(b.train2,b.test2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34 5
## 1 4 46
##
## Accuracy : 0.8989
## 95% CI : (0.8167, 0.9527)
## No Information Rate : 0.573
## P-Value [Acc > NIR] : 1.584e-11
##
## Kappa : 0.794
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8947
## Specificity : 0.9020
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.9200
## Prevalence : 0.4270
## Detection Rate : 0.3820
## Detection Prevalence : 0.4382
## Balanced Accuracy : 0.8983
##
## 'Positive' Class : 0
##
b.knn.confit(b.train20,b.test20)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 41 1
## 1 6 54
##
## Accuracy : 0.9314
## 95% CI : (0.8637, 0.972)
## No Information Rate : 0.5392
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8608
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.8723
## Specificity : 0.9818
## Pos Pred Value : 0.9762
## Neg Pred Value : 0.9000
## Prevalence : 0.4608
## Detection Rate : 0.4020
## Detection Prevalence : 0.4118
## Balanced Accuracy : 0.9271
##
## 'Positive' Class : 0
##
b.knn.confit(b.train21,b.test21)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 64 6
## 1 8 74
##
## Accuracy : 0.9079
## 95% CI : (0.8503, 0.9487)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.815
## Mcnemar's Test P-Value : 0.7893
##
## Sensitivity : 0.8889
## Specificity : 0.9250
## Pos Pred Value : 0.9143
## Neg Pred Value : 0.9024
## Prevalence : 0.4737
## Detection Rate : 0.4211
## Detection Prevalence : 0.4605
## Balanced Accuracy : 0.9069
##
## 'Positive' Class : 0
##
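With this many confusion matrices, it helps to collect the overall accuracies in one place. A rough sketch of how that comparison could be automated for the 80% split, reusing the models and helper functions defined above (the KNN number can move a little between runs because of random tie-breaking):
get_acc <- function(cm) unname(cm$overall["Accuracy"]) #pull Accuracy out of a caret confusionMatrix#
round(c(LDA      = get_acc(confit(b.lda.train, b.test, b.test$crim01)),
        QDA      = get_acc(confit(b.qda.train, b.test, b.test$crim01)),
        Logistic = get_acc(b.log.confit(b.train, b.test)),
        KNN_k1   = get_acc(b.knn.confit(b.train, b.test))), 3)
On these particular splits, the full-variable logistic fit on the 80% sample did best at roughly 0.98 accuracy, with KNN at k = 1 close behind, while the smaller variable sets tended to help QDA but hurt the logistic model.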