# caret provides createDataPartition(), train(), and confusionMatrix()
library(caret)

bcwd <- read.csv(file = "C:/Users/Denise/Documents/bcwd.csv", header = TRUE)
bcwd$V1 <- NULL
V1 was the patient ID number, which we removed from our data frame.
### B)
partition <- createDataPartition(y = bcwd$Diagnostic, p = .7, list = FALSE)
trainingSet <- bcwd[partition, ]
testingSet <- bcwd[-partition, ]
kNN1 <- train(Diagnostic ~ ., data = trainingSet, method = "knn",
              trControl = trainControl(method = "cv", number = 10),
              preProcess = c("center", "scale"),
              tuneGrid = data.frame(.k = 1:20))
kNN1
## k-Nearest Neighbors
##
## 399 samples
## 31 predictor
## 2 classes: 'B', 'M'
##
## Pre-processing: centered (31), scaled (31)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 359, 360, 359, 359, 359, 359, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9524359 0.8980283
## 2 0.9599359 0.9129673
## 3 0.9625000 0.9187126
## 4 0.9549359 0.9020992
## 5 0.9675000 0.9292486
## 6 0.9675000 0.9297117
## 7 0.9675000 0.9291044
## 8 0.9600000 0.9130305
## 9 0.9600000 0.9131708
## 10 0.9700000 0.9345079
## 11 0.9625000 0.9182936
## 12 0.9650000 0.9234107
## 13 0.9625000 0.9177091
## 14 0.9650000 0.9238699
## 15 0.9599359 0.9120488
## 16 0.9625000 0.9177091
## 17 0.9624359 0.9176023
## 18 0.9549359 0.9016783
## 19 0.9549359 0.9016783
## 20 0.9625000 0.9184685
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 10.
plot(kNN1$results$k, kNN1$results$Accuracy, xlab = "k Level", ylab = "Accuracy")
The optimal value of k is 10 because it has the highest cross-validated accuracy.
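As a check, the chosen k can also be read straight off the fitted caret object rather than from the printed table; a quick sketch:

kNN1$bestTune  # the tuning parameter caret selected (k = 10)
kNN1$results[which.max(kNN1$results$Accuracy), c("k", "Accuracy", "Kappa")]  # row with highest accuracy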
kNN1Test<- predict(kNN1, newdata = testingSet)
confusionMatrix(kNN1Test, testingSet$Diagnostic)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 104 5
## M 3 58
##
## Accuracy : 0.9529
## 95% CI : (0.9094, 0.9795)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8985
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9720
## Specificity : 0.9206
## Pos Pred Value : 0.9541
## Neg Pred Value : 0.9508
## Prevalence : 0.6294
## Detection Rate : 0.6118
## Detection Prevalence : 0.6412
## Balanced Accuracy : 0.9463
##
## 'Positive' Class : B
##
Type I error = 1 - Specificity = 1 - 0.9206 = 0.0794
Type II error = 1 - Sensitivity = 1 - 0.9720 = 0.0280
We would rather have a higher probability of a false positive (Type I error) than of a false negative, so this split of the errors is preferable. Both error rates are relatively low, which is not surprising given the number of observations in our data set.
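These error rates can also be pulled directly from the confusionMatrix output instead of being computed by hand; a small sketch (cm is just an illustrative name for the stored result):

cm <- confusionMatrix(kNN1Test, testingSet$Diagnostic)
1 - cm$byClass["Specificity"]  # Type I error (false positive rate)
1 - cm$byClass["Sensitivity"]  # Type II error (false negative rate)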
library(MASS)  # Boston housing data

set.seed(12345)
partition2 <- createDataPartition(y = Boston$medv, p = .7, list = FALSE)
trainingSet2 <- Boston[partition2, ]
testingSet2 <- Boston[-partition2, ]
kNN2 <- train(medv ~ ., data = trainingSet2, method = "knn",
              trControl = trainControl(method = "cv", number = 10),
              tuneGrid = data.frame(.k = 1:20))
kNN2
## k-Nearest Neighbors
##
## 356 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 321, 322, 320, 321, 320, 319, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared
## 1 7.201460 0.5265164
## 2 6.596015 0.5266002
## 3 6.247925 0.5719437
## 4 6.139918 0.5886012
## 5 6.321588 0.5619085
## 6 6.567022 0.5208536
## 7 6.614972 0.5070256
## 8 6.600537 0.5056506
## 9 6.613283 0.5052519
## 10 6.717963 0.4866685
## 11 6.832261 0.4668308
## 12 6.919215 0.4520145
## 13 6.984703 0.4410136
## 14 7.081291 0.4246618
## 15 7.143166 0.4144174
## 16 7.232036 0.3983197
## 17 7.283857 0.3902434
## 18 7.303791 0.3870002
## 19 7.340252 0.3792767
## 20 7.381562 0.3724395
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
predicted_kNN2<- predict(kNN2, newdata = testingSet2)
cor(predicted_kNN2, testingSet2$medv)^2
## [1] 0.5319884
kNN3 <- train(medv ~ ., data = trainingSet2, method = "knn",
              trControl = trainControl(method = "cv", number = 10),
              preProcess = c("center", "scale"),
              tuneGrid = data.frame(.k = 1:20))
kNN3
## k-Nearest Neighbors
##
## 356 samples
## 13 predictor
##
## Pre-processing: centered (13), scaled (13)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 320, 320, 320, 320, 322, 321, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared
## 1 4.810882 0.7442067
## 2 4.718302 0.7553453
## 3 4.634802 0.7639799
## 4 4.653894 0.7696848
## 5 4.705223 0.7642321
## 6 4.765630 0.7622143
## 7 4.781993 0.7606458
## 8 4.740353 0.7661691
## 9 4.728681 0.7666896
## 10 4.790754 0.7599721
## 11 4.817784 0.7581202
## 12 4.877277 0.7532926
## 13 4.898140 0.7504160
## 14 4.970570 0.7421852
## 15 4.984643 0.7426247
## 16 5.001537 0.7410263
## 17 5.047684 0.7382492
## 18 5.100998 0.7342969
## 19 5.120166 0.7337563
## 20 5.171352 0.7296926
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 3.
predicted_kNN3<- predict(kNN3, newdata = testingSet2)
cor(predicted_kNN3, testingSet2$medv)^2
## [1] 0.8117092
We scaled our predictors so that their numerical ranges are comparable. For instance, the values of crim (per-capita crime rate) are small for most towns, while the values of tax (property-tax rate per $10,000) are in the hundreds. Because kNN measures distances between observations, unscaled predictors with large ranges dominate the distance calculation, so centering and scaling puts every predictor on an equal footing. Our test-set r-squared increased substantially (from 0.53 to 0.81), meaning the scaled model is a much better predictor.
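To see the scale mismatch directly, we can compare the ranges of the two predictors mentioned above; a quick sketch:

sapply(Boston[, c("crim", "tax")], range)  # min and max of each predictor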
k = 3 is our best model since it has the lowest cross-validated RMSE.
data("ChemicalManufacturingProcess")
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
Number of missing entries = 106
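To see where those 106 missing entries sit before imputing, a quick sketch counting NAs by column:

na_counts <- colSums(is.na(ChemicalManufacturingProcess))
na_counts[na_counts > 0]  # only the columns that actually contain missing values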
library(DMwR)  # knnImputation()
imputed_CMP <- knnImputation(ChemicalManufacturingProcess, k = 5)
set.seed(12345)
partition3 <- createDataPartition(y = imputed_CMP$Yield, p = .7, list = FALSE)
trainingSet_CMP <- imputed_CMP[partition3, ]
testingSet_CMP <- imputed_CMP[-partition3, ]
kNN_CMP <- train(Yield ~ ., data = trainingSet_CMP, method = "knn",
                 trControl = trainControl(method = "cv", number = 10),
                 preProcess = c("center", "scale"),
                 tuneGrid = data.frame(.k = 1:20))
kNN_CMP
## k-Nearest Neighbors
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 110, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared
## 1 1.433272 0.4660500
## 2 1.251046 0.5727127
## 3 1.284574 0.5545142
## 4 1.228839 0.6032811
## 5 1.253296 0.5783980
## 6 1.253527 0.5726865
## 7 1.288125 0.5406860
## 8 1.311436 0.5171141
## 9 1.329873 0.4993553
## 10 1.328004 0.5011248
## 11 1.356679 0.4722567
## 12 1.372249 0.4561680
## 13 1.380990 0.4481100
## 14 1.384885 0.4442949
## 15 1.393764 0.4446986
## 16 1.401031 0.4376488
## 17 1.410588 0.4301057
## 18 1.420762 0.4198853
## 19 1.424922 0.4239067
## 20 1.431547 0.4212337
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
predicted_CMP<- predict(kNN_CMP, newdata = testingSet_CMP)
cor(predicted_CMP, testingSet_CMP$Yield)^2
## [1] 0.4344609
k = 4 is optimal because it has both the highest cross-validated r-squared and the smallest cross-validated RMSE.
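For completeness, the test-set RMSE can be computed alongside the r-squared above; a one-line sketch:

sqrt(mean((predicted_CMP - testingSet_CMP$Yield)^2))  # test-set RMSE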
TechStocks <- read.csv(file = "C:/Users/Denise/Documents/TechStocks.csv", header = TRUE)
TechStocks <- TechStocks[, -1]  # drop the first column, leaving the nine price series
logratio <- function(x) {
  # daily log returns: log(P_t / P_(t-1))
  log(x[2:length(x)] / x[1:(length(x) - 1)])
}
Note the parentheses around length(x) - 1: the original x[1:length(x)-1] evaluates as x[0:(length(x)-1)], which only works because R silently drops index 0.
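As a quick sanity check on logratio, a toy price series with two successive 10% gains should give two identical log returns:

logratio(c(100, 110, 121))  # both values equal log(1.1), about 0.0953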
returns <- apply(TechStocks, 2, logratio)
means <- apply(returns, 2, mean) * 252  # annualized mean returns (252 trading days)
cov_returns <- cov(returns) * 252       # annualized covariance matrix
library(quadprog)  # solve.QP()

mean_return <- function(mean, covariance) {
  # minimum-variance portfolio: minimize (1/2) t(w) %*% covariance %*% w
  # subject to the single equality constraint sum(w) = 1 (meq = 1)
  Amat <- matrix(rep(1, 9), ncol = 1)
  solutions <- solve.QP(covariance, rep(0, 9), Amat, c(1), meq = 1)
  solutions$solution %*% mean  # expected return of the minimum-variance weights
}
mean_return(means, cov_returns)
## [,1]
## [1,] 0.1837864
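Since the only constraint is that the weights sum to one, this quadratic program has the closed-form solution w = Sigma^(-1) 1 / (1' Sigma^(-1) 1), which gives an independent check on the solve.QP answer; a quick sketch (w_mv is just an illustrative name):

w_mv <- solve(cov_returns, rep(1, 9))  # Sigma^(-1) %*% 1
w_mv <- w_mv / sum(w_mv)               # normalize so the weights sum to 1
w_mv %*% means                         # should match mean_return(means, cov_returns)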
resample <- function() {
  # one bootstrap draw: resample each stock's daily returns with replacement
  # (each column is resampled independently)
  samp_returns <- apply(returns, 2, function(x) sample(x, length(x), replace = TRUE))
  samp_mean <- apply(samp_returns, 2, mean) * 252
  sampcov_returns <- cov(samp_returns) * 252
  mean_return(samp_mean, sampcov_returns)
}
Tech_samples <- sort(replicate(1000, resample()))
hist(Tech_samples)
c(Tech_samples[25], Tech_samples[975])  # 2.5th and 97.5th percentiles: a 95% bootstrap CI
## [1] 0.1097394 0.3202161
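The same interval can be read off with quantile(), which avoids the manual sort-and-index step; a one-line sketch:

quantile(Tech_samples, c(0.025, 0.975))  # 95% bootstrap confidence interval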