library(readr)
wdbc <- read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data")
names(wdbc)<-c("IDNumber", "Diagnosis", "Raduis1", "Texture1", "Perimeter1","Area1","Smoothness1","Compactness1","Concavity1","ConcavePoints1","Symmetry1","FractionalDimension1","Raduis2", "Texture2", "Perimeter2","Area2","Smoothness2","Compactness2","Concavity2","ConcavePoints2","Symmetry2","FractionalDimension2","Raduis3", "Texture3", "Perimeter3","Area3","Smoothness3","Compactness3","Concavity3","ConcavePoints3","Symmetry3","FractionalDimension3" )
wdbcFinal<-subset(wdbc, select = -c(IDNumber))
library(caret)
set.seed(12345)
SepData1k<- createDataPartition (wdbcFinal$Diagnosis, p=.7, list=FALSE)
wdbcTrain<- wdbcFinal[SepData1k,]
wdbcTest<- wdbcFinal[-SepData1k,]
library(e1071)
KNN<-train(Diagnosis ~., data = wdbcTrain, method= "knn", maximize= TRUE, metric = "RSquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"), tuneGrid=data.frame(.k=1:20))
plot(KNN, xlab = "Number of Neighbors", ylab= "Accuracy" )
KNN
## k-Nearest Neighbors
##
## 398 samples
## 30 predictor
## 2 classes: 'B', 'M'
##
## Pre-processing: centered (30), scaled (30)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 358, 359, 359, 358, 358, 358, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9450000 0.8818822
## 2 0.9399359 0.8698237
## 3 0.9599359 0.9125390
## 4 0.9548718 0.9019636
## 5 0.9598718 0.9128724
## 6 0.9598718 0.9127744
## 7 0.9523077 0.8964513
## 8 0.9548077 0.9015605
## 9 0.9523077 0.8958527
## 10 0.9573718 0.9065824
## 11 0.9548077 0.9010345
## 12 0.9548077 0.9010345
## 13 0.9598077 0.9116972
## 14 0.9573077 0.9059895
## 15 0.9523077 0.8950306
## 16 0.9573718 0.9061258
## 17 0.9548077 0.9002817
## 18 0.9548077 0.9002817
## 19 0.9548077 0.9002817
## 20 0.9548077 0.9002817
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.
The best K optimal is 9 with an R-Squared of .965. It is important to note there were other K neighbors R-Squared extremely close to this one.
library(caret)
library(kernlab)
Knnp <- predict(KNN, newdata= wdbcTest)
confusionMatrix(table(Knnp, wdbcTest$Diagnosis))
## Confusion Matrix and Statistics
##
##
## Knnp B M
## B 104 5
## M 3 58
##
## Accuracy : 0.9529
## 95% CI : (0.9094, 0.9795)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8985
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9720
## Specificity : 0.9206
## Pos Pred Value : 0.9541
## Neg Pred Value : 0.9508
## Prevalence : 0.6294
## Detection Rate : 0.6118
## Detection Prevalence : 0.6412
## Balanced Accuracy : 0.9463
##
## 'Positive' Class : B
##
Type 1 error probability would be 8/55 and a Type 2 error probability would be 1/106.
errors<-(predict(KNN, newdata= wdbcTest)!= wdbcTest$Diagnosis)
errorsnew<-wdbcTest[which(errors== TRUE),]
predict(KNN, newdata= errorsnew, type="prob")
## B M
## 1 0.0000000 1.0000000
## 2 0.6666667 0.3333333
## 3 0.6666667 0.3333333
## 4 0.3333333 0.6666667
## 5 1.0000000 0.0000000
## 6 0.6666667 0.3333333
## 7 0.3333333 0.6666667
## 8 0.6666667 0.3333333
In one of the cases, there was one model claiming an accuraccy of 100% when it was wrong.
library(MASS)
library(caret)
set.seed(12345)
Seperate<- createDataPartition(Boston$medv, p=.7, list=FALSE)
Testing<-Boston[Seperate,]
Training <- Boston[Seperate, ]
library(caret)
library(MASS)
KNN2 <-train(medv~., data = Training, method= "knn", maximize= TRUE, metric = "Rsquared", trControl=trainControl(method ="cv", number=10))
KNN2Predict <-predict(KNN2, newdata = Testing)
KNN2RSquared <- cor(KNN2Predict, Testing$medv)^2
KNN2RSquared
## [1] 0.7154776
KNN3<- train(medv~., data = Training, method= "knn", maximize= TRUE, metric = "Rsquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"))
KNN3Predict <-predict(KNN3, newdata = Testing)
KNN3RSquared <- cor(KNN3Predict, Testing$medv)^2
KNN3RSquared
## [1] 0.7919031
plot(KNN3)
The r-squared increased significantly by .08.
The optimal K is at 9.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
There is 106 NA’s in this dataset.
library(VIM)
Chemical <-kNN(ChemicalManufacturingProcess, k=5)
sum(is.na(Chemical))
## [1] 0
library(caret)
set.seed(12345)
SepData <- createDataPartition(Chemical$Yield, p=.7, list = FALSE)
ChemTest <- Chemical[SepData, ]
ChemTrain <- Chemical[SepData, ]
KNN4 <- train( Yield ~ ., data = ChemTrain, method= "knn", metric = "Rsquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"), tuneGrid=data.frame(.k=1:20))
KNN4Predict <-predict(KNN4, ChemTest)
KNN4RSquared <- cor(KNN4Predict, ChemTest$Yield)^2
KNN4RSquared
## [1] 0.7568617
plot(KNN4)
KNN4
## k-Nearest Neighbors
##
## 124 samples
## 115 predictors
##
## Pre-processing: centered (115), scaled (115)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 111, 112, 112, 111, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.460987 0.4582886 1.1251218
## 2 1.389531 0.4630519 1.0960256
## 3 1.351518 0.4703072 1.0705043
## 4 1.227563 0.5680542 0.9864808
## 5 1.301464 0.5232972 1.0331167
## 6 1.337244 0.4956426 1.0721722
## 7 1.292751 0.5262657 1.0363860
## 8 1.330814 0.4875055 1.0843779
## 9 1.371698 0.4577452 1.1293967
## 10 1.387537 0.4468868 1.1397092
## 11 1.391684 0.4462788 1.1241600
## 12 1.394426 0.4469158 1.1320746
## 13 1.399525 0.4507619 1.1321749
## 14 1.411944 0.4383077 1.1437119
## 15 1.418091 0.4342179 1.1509957
## 16 1.432723 0.4235050 1.1608584
## 17 1.431557 0.4264290 1.1677621
## 18 1.438566 0.4208805 1.1734712
## 19 1.441002 0.4264747 1.1743443
## 20 1.446564 0.4322265 1.1749258
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 4.
The optimal K should be 4 since it produces the highest R-Squared of 0.5785895.
library(readxl)
TechStocks <- read_excel("Data/TechStocks.xlsx")
TechStocks <-subset(TechStocks, select = -c(Date))
Logarithm<-function(x)
{ log(x[2:length(x)]/x[1:length(x)-1])
}
Means<-function(x)
{(mean(x)*252)
}
Returns <-apply(TechStocks,2,Logarithm)
Meanvector <-apply(Returns,2,Means)
CovarienceMatrix <-cov(Returns)*252
library(quadprog)
annualized <-function(ReturnMatrix,Mean_vector)
{
Dmat <- cov(ReturnMatrix)
dvec <-rep(0,9)
bvec <-c(1)
Amat <-(matrix(1,nrow=9, ncol=1))
solution <-solve.QP(Dmat, dvec, Amat, bvec, meq=1)
solution <-solution$solution
Meanreturn <- Mean_vector%*%solution
Meanreturn
}
annualized(Returns, Meanvector)
## [,1]
## [1,] 0.1837864
library(gdata)
Sample1<-function(Returns)
{
Min<- apply(Returns, 2, function(x) sample(x,length(x), replace= TRUE))
Min<-matrix(Min, ncol=9)
cov(Min)
MeanV <-apply(Min,2,Means)
annualized(Min, MeanV)
}
Sample1(Returns)
## [,1]
## [1,] 0.1653564
library(km.ci)
ration <-sort(replicate( 1000, Sample1(Returns)))
hist(ration, main ="Histogram of Sample means", xlab= "Sample Means")
Confidence <- c(ration[25], ration[1000-25])
Confidence
## [1] 0.1010873 0.3211863