Project 3

Problem 1A

library(readr)
wdbc <- read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data")

names(wdbc)<-c("IDNumber", "Diagnosis", "Raduis1", "Texture1", "Perimeter1","Area1","Smoothness1","Compactness1","Concavity1","ConcavePoints1","Symmetry1","FractionalDimension1","Raduis2", "Texture2", "Perimeter2","Area2","Smoothness2","Compactness2","Concavity2","ConcavePoints2","Symmetry2","FractionalDimension2","Raduis3", "Texture3", "Perimeter3","Area3","Smoothness3","Compactness3","Concavity3","ConcavePoints3","Symmetry3","FractionalDimension3" )

wdbcFinal<-subset(wdbc, select = -c(IDNumber))

Problem 1B

library(caret)
set.seed(12345)

SepData1k<- createDataPartition (wdbcFinal$Diagnosis, p=.7, list=FALSE)

wdbcTrain<- wdbcFinal[SepData1k,]
wdbcTest<- wdbcFinal[-SepData1k,]

Problem 1C

library(e1071)

KNN<-train(Diagnosis ~., data = wdbcTrain, method= "knn", maximize= TRUE, metric = "RSquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"), tuneGrid=data.frame(.k=1:20))

Problem 1D

plot(KNN, xlab = "Number of Neighbors", ylab= "Accuracy" )

KNN

## k-Nearest Neighbors 
## 
## 398 samples
##  30 predictor
##   2 classes: 'B', 'M' 
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 358, 359, 359, 358, 358, 358, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.9450000  0.8818822
##    2  0.9399359  0.8698237
##    3  0.9599359  0.9125390
##    4  0.9548718  0.9019636
##    5  0.9598718  0.9128724
##    6  0.9598718  0.9127744
##    7  0.9523077  0.8964513
##    8  0.9548077  0.9015605
##    9  0.9523077  0.8958527
##   10  0.9573718  0.9065824
##   11  0.9548077  0.9010345
##   12  0.9548077  0.9010345
##   13  0.9598077  0.9116972
##   14  0.9573077  0.9059895
##   15  0.9523077  0.8950306
##   16  0.9573718  0.9061258
##   17  0.9548077  0.9002817
##   18  0.9548077  0.9002817
##   19  0.9548077  0.9002817
##   20  0.9548077  0.9002817
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 3.

The best K optimal is 9 with an R-Squared of .965. It is important to note there were other K neighbors R-Squared extremely close to this one.

Problem 1E

library(caret)
library(kernlab)

Knnp <- predict(KNN, newdata= wdbcTest)


confusionMatrix(table(Knnp, wdbcTest$Diagnosis))

## Confusion Matrix and Statistics
## 
##     
## Knnp   B   M
##    B 104   5
##    M   3  58
##                                           
##                Accuracy : 0.9529          
##                  95% CI : (0.9094, 0.9795)
##     No Information Rate : 0.6294          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8985          
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.9720          
##             Specificity : 0.9206          
##          Pos Pred Value : 0.9541          
##          Neg Pred Value : 0.9508          
##              Prevalence : 0.6294          
##          Detection Rate : 0.6118          
##    Detection Prevalence : 0.6412          
##       Balanced Accuracy : 0.9463          
##                                           
##        'Positive' Class : B               
##

Type 1 error probability would be 8/55 and a Type 2 error probability would be 1/106.

Problem 1F

errors<-(predict(KNN, newdata= wdbcTest)!= wdbcTest$Diagnosis)


errorsnew<-wdbcTest[which(errors== TRUE),]

predict(KNN, newdata= errorsnew, type="prob")

##           B         M
## 1 0.0000000 1.0000000
## 2 0.6666667 0.3333333
## 3 0.6666667 0.3333333
## 4 0.3333333 0.6666667
## 5 1.0000000 0.0000000
## 6 0.6666667 0.3333333
## 7 0.3333333 0.6666667
## 8 0.6666667 0.3333333

In one of the cases, there was one model claiming an accuraccy of 100% when it was wrong.

Problem 2A

library(MASS)
library(caret)

set.seed(12345)

Seperate<- createDataPartition(Boston$medv, p=.7, list=FALSE)

Testing<-Boston[Seperate,]
Training <- Boston[Seperate, ]

Problem 2B

library(caret)
library(MASS)

KNN2 <-train(medv~., data = Training, method= "knn", maximize= TRUE, metric = "Rsquared", trControl=trainControl(method ="cv", number=10))

KNN2Predict <-predict(KNN2, newdata = Testing)

KNN2RSquared <- cor(KNN2Predict, Testing$medv)^2

KNN2RSquared

## [1] 0.7154776

Problem 2C

KNN3<- train(medv~., data = Training, method= "knn", maximize= TRUE, metric = "Rsquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"))

KNN3Predict <-predict(KNN3, newdata = Testing)

KNN3RSquared <- cor(KNN3Predict, Testing$medv)^2

KNN3RSquared

## [1] 0.7919031

plot(KNN3)

Problem 2D

The r-squared increased significantly by .08.

Problem 2E

The optimal K is at 9.

Problem 3A

library(AppliedPredictiveModeling)

data(ChemicalManufacturingProcess)


sum(is.na(ChemicalManufacturingProcess))

## [1] 106

There is 106 NA’s in this dataset.

Problem 3B

library(VIM)

Chemical <-kNN(ChemicalManufacturingProcess, k=5)

sum(is.na(Chemical))

## [1] 0

Problem 3C

library(caret)

set.seed(12345)

SepData <- createDataPartition(Chemical$Yield, p=.7, list = FALSE)

ChemTest <- Chemical[SepData, ]
ChemTrain <- Chemical[SepData, ]

Problem 3D

KNN4 <- train( Yield ~ ., data = ChemTrain, method= "knn", metric = "Rsquared", trControl=trainControl(method ="cv", number=10),preProcess= c("center", "scale"), tuneGrid=data.frame(.k=1:20))


KNN4Predict <-predict(KNN4, ChemTest)

KNN4RSquared <- cor(KNN4Predict, ChemTest$Yield)^2

KNN4RSquared

## [1] 0.7568617

Problem 3E

plot(KNN4)

KNN4

## k-Nearest Neighbors 
## 
## 124 samples
## 115 predictors
## 
## Pre-processing: centered (115), scaled (115) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 112, 111, 111, 112, 112, 111, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE      
##    1  1.460987  0.4582886  1.1251218
##    2  1.389531  0.4630519  1.0960256
##    3  1.351518  0.4703072  1.0705043
##    4  1.227563  0.5680542  0.9864808
##    5  1.301464  0.5232972  1.0331167
##    6  1.337244  0.4956426  1.0721722
##    7  1.292751  0.5262657  1.0363860
##    8  1.330814  0.4875055  1.0843779
##    9  1.371698  0.4577452  1.1293967
##   10  1.387537  0.4468868  1.1397092
##   11  1.391684  0.4462788  1.1241600
##   12  1.394426  0.4469158  1.1320746
##   13  1.399525  0.4507619  1.1321749
##   14  1.411944  0.4383077  1.1437119
##   15  1.418091  0.4342179  1.1509957
##   16  1.432723  0.4235050  1.1608584
##   17  1.431557  0.4264290  1.1677621
##   18  1.438566  0.4208805  1.1734712
##   19  1.441002  0.4264747  1.1743443
##   20  1.446564  0.4322265  1.1749258
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was k = 4.

The optimal K should be 4 since it produces the highest R-Squared of 0.5785895.

Problem 4A

library(readxl)

TechStocks <- read_excel("Data/TechStocks.xlsx")

TechStocks <-subset(TechStocks, select = -c(Date))

Logarithm<-function(x)
{ log(x[2:length(x)]/x[1:length(x)-1])
  
}

Means<-function(x)
{(mean(x)*252)
  
}

Returns <-apply(TechStocks,2,Logarithm)

Meanvector <-apply(Returns,2,Means)

CovarienceMatrix <-cov(Returns)*252

Problem 4B

library(quadprog)


annualized <-function(ReturnMatrix,Mean_vector)
{
 
Dmat <- cov(ReturnMatrix)
dvec <-rep(0,9)
bvec <-c(1)
Amat <-(matrix(1,nrow=9, ncol=1))
solution <-solve.QP(Dmat, dvec, Amat, bvec, meq=1)

solution <-solution$solution

Meanreturn <- Mean_vector%*%solution

Meanreturn

}

annualized(Returns, Meanvector)

##           [,1]
## [1,] 0.1837864

Problem 4C

library(gdata)
Sample1<-function(Returns)
{
  
Min<- apply(Returns, 2, function(x) sample(x,length(x), replace= TRUE))
Min<-matrix(Min, ncol=9)
cov(Min)
  
MeanV <-apply(Min,2,Means)

  
annualized(Min, MeanV)


  }


Sample1(Returns)

##           [,1]
## [1,] 0.1653564

Problem 4D

library(km.ci)

ration <-sort(replicate( 1000, Sample1(Returns)))

hist(ration, main ="Histogram of Sample means", xlab= "Sample Means")

Confidence <- c(ration[25], ration[1000-25])

Confidence

## [1] 0.1010873 0.3211863