# Packages used below: data.table (fread), caret (createDataPartition, train,
# confusionMatrix), MASS (Boston), AppliedPredictiveModeling
# (ChemicalManufacturingProcess), DMwR (knnImputation), quadprog (solve.QP).
library(data.table); library(caret); library(MASS)
library(AppliedPredictiveModeling); library(DMwR); library(quadprog)
wdbc <- fread('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
head(wdbc)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1: 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710
## 2: 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017
## 3: 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790
## 4: 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520
## 5: 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430
## 6: 843786 M 12.45 15.70 82.57 477.1 0.12780 0.17000 0.1578 0.08089
## V11 V12 V13 V14 V15 V16 V17 V18 V19
## 1: 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373
## 2: 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860
## 3: 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832
## 4: 0.2597 0.09744 0.4956 1.1560 3.445 27.23 0.009110 0.07458 0.05661
## 5: 0.1809 0.05883 0.7572 0.7813 5.438 94.44 0.011490 0.02461 0.05688
## 6: 0.2087 0.07613 0.3345 0.8902 2.217 27.19 0.007510 0.03345 0.03672
## V20 V21 V22 V23 V24 V25 V26 V27 V28 V29
## 1: 0.01587 0.03003 0.006193 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119
## 2: 0.01340 0.01389 0.003532 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416
## 3: 0.02058 0.02250 0.004571 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504
## 4: 0.01867 0.05963 0.009208 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869
## 5: 0.01885 0.01756 0.005115 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000
## 6: 0.01137 0.02165 0.005082 15.47 23.75 103.40 741.6 0.1791 0.5249 0.5355
## V30 V31 V32
## 1: 0.2654 0.4601 0.11890
## 2: 0.1860 0.2750 0.08902
## 3: 0.2430 0.3613 0.08758
## 4: 0.2575 0.6638 0.17300
## 5: 0.1625 0.2364 0.07678
## 6: 0.1741 0.3985 0.12440
wdbc$V1 <- NULL
Removed the patient ID column from the data set.
set.seed(12345)
dp <- createDataPartition(wdbc$V2, p=0.7, list=FALSE)
training <- wdbc[dp,]
testing <- wdbc[-dp,]
Set the seed for reproducibility, then created training (70%) and testing (30%) data partitions.
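Because createDataPartition samples within the levels of its first argument, the benign/malignant mix should be nearly identical across the two partitions; a quick check (a sketch using the objects above):
prop.table(table(training$V2))  # proportion of B and M in the training set
prop.table(table(testing$V2))   # should closely match the training proportions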
kNN1 <- train(V2~., method = "knn", data = training,
              trControl = trainControl(method = "cv", number = 10),
              preProcess = c("center","scale"),
              tuneGrid = data.frame(.k=1:20))
Fit a k-nearest neighbors model with 10-fold cross-validation, testing values of k from 1 to 20, with the predictors centered and scaled.
plot(kNN1$results$k, kNN1$results$Accuracy, xlab = "k", ylab = "Accuracy")
As seen in the plot, accuracy peaks at k = 6, making 6 the optimal value for k.
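caret also records the winning tuning value directly on the fitted object, so the plot can be double-checked programmatically:
kNN1$bestTune  # the value of k with the highest cross-validated accuracy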
kNN1.predicted <- predict(kNN1, newdata = testing)
confusionMatrix(kNN1.predicted, testing$V2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 103 5
## M 4 58
##
## Accuracy : 0.9471
## 95% CI : (0.9019, 0.9755)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8861
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9626
## Specificity : 0.9206
## Pos Pred Value : 0.9537
## Neg Pred Value : 0.9355
## Prevalence : 0.6294
## Detection Rate : 0.6059
## Detection Prevalence : 0.6353
## Balanced Accuracy : 0.9416
##
## 'Positive' Class : B
##
Generated predictions from kNN1 on the testing data, then produced a confusion matrix comparing the predicted values in kNN1.predicted to the actual values in the testing data.
Based on the confusion matrix, taking "malignant" as the null hypothesis: Alpha (Type I error) = 5/63, the malignant cases misclassified as benign; Beta (Type II error) = 4/107, the benign cases misclassified as malignant.
5/63
## [1] 0.07936508
4/107
## [1] 0.03738318
The error rates on the testing data are shown above: Type I error probability = 7.94% and Type II error probability = 3.74%. Both are below 8%, which is not surprising given the model's overall accuracy of about 94.7%.
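The same two rates can be pulled straight from the confusion-matrix object instead of typed by hand; a minimal sketch, using the fact that confusionMatrix tables have predictions as rows and reference values as columns:
cm <- confusionMatrix(kNN1.predicted, testing$V2)
cm$table["B","M"] / sum(cm$table[,"M"])  # malignant cases called benign: 5/63
cm$table["M","B"] / sum(cm$table[,"B"])  # benign cases called malignant: 4/107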
set.seed(12345)
dp2 <- createDataPartition(Boston$medv, p=0.7, list=FALSE)
training <- Boston[dp2,]
testing <- Boston[-dp2,]
Set seed and created testing and training partitions of the Boston data.
kNN2 <- train(medv~., method = "knn", data = training,
              trControl = trainControl(method = "cv", number = 10))
cor(testing$medv, predict(kNN2, newdata = testing))^2
## [1] 0.489145
Created a k-nearest neighbors model for the medv variable using all other variables as predictors; in kNN2 the predictors were not centered or scaled.
As seen above, R² using the testing data with kNN2 is rather low at 0.489145.
kNN3 <- train(medv~., method = "knn", data = training,
              trControl = trainControl(method = "cv", number = 10),
              preProcess = c("center","scale"))
cor(testing$medv, predict(kNN3, newdata = testing))^2
## [1] 0.7969769
Created another k-nearest neighbors model for the medv variable using all other variables as predictors. This time the predictors were centered and scaled.
As seen above, R² using the testing data with kNN3 is much higher at 0.7969769.
Scaling and centering the predictors made a huge difference, as seen in the R² values produced by the respective models above. kNN is distance-based, so without standardization the predictors with the largest numeric ranges dominate the Euclidean distance; in kNN3, where the predictors were scaled and centered, the predictive ability of the model was much higher than in kNN2.
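To see why, compare the raw ranges of two of the Boston predictors; without standardization, distances are driven almost entirely by the variable measured in the hundreds (the ranges below are approximate):
range(Boston$tax)  # roughly 187 to 711
range(Boston$nox)  # roughly 0.38 to 0.87, negligible next to tax when unscaled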
kNN3$results
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 5 4.848538 0.7364466 3.162348 1.152061 0.1465797 0.4490481
## 2 7 4.818407 0.7450562 3.180713 1.124558 0.1199281 0.3722370
## 3 9 4.776094 0.7500352 3.106382 1.336867 0.1266857 0.4753765
Based on the information above, 9 is the optimal value for k, as its RMSE is the lowest of the three values tried. Since no tuneGrid was supplied, caret evaluated only its default grid of k = 5, 7, 9.
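A wider search could be run exactly as for kNN1; a sketch (kNN3b is a hypothetical name, and the broader grid may or may not select a different k):
kNN3b <- train(medv~., method = "knn", data = training,
               trControl = trainControl(method = "cv", number = 10),
               preProcess = c("center","scale"),
               tuneGrid = data.frame(.k=1:20))
kNN3b$bestTune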
data("ChemicalManufacturingProcess")
CMP <- ChemicalManufacturingProcess
sum(is.na(CMP))
## [1] 106
Calculated the number of NA entries in CMP. There are 106 such entries.
CMP <- knnImputation(CMP, k=5)
Replaced the NA entries using the knnImputation function with k=5.
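A quick sanity check that the imputation left nothing missing (using the CMP object above):
sum(is.na(CMP))  # should now be 0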
set.seed(12345)
dp3 <- createDataPartition(CMP$Yield, p=0.7, list=FALSE)
training <- CMP[dp3,]
testing <- CMP[-dp3,]
Set the seed and created testing and training data partitions using the now NA-free dataset.
kNN4 <- train(Yield~., method = "knn", data = training,
              trControl = trainControl(method = "cv", number = 10),
              preProcess = c("center","scale"),
              tuneGrid = data.frame(.k=1:20))
cor(testing$Yield, predict(kNN4, newdata = testing))^2
## [1] 0.4344609
Created a k-nearest neighbors model for the Yield variable, centering and scaling the predictors and trying values from 1 to 20 for k.
As seen above, R² using the testing data with kNN4 is a modest 0.4344609.
kNN4$results
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 1.427711 0.4644080 1.1057308 0.4483978 0.2368294 0.3181594
## 2 2 1.365624 0.4725430 1.0805609 0.3448858 0.2119629 0.2827429
## 3 3 1.334300 0.4907510 1.0533397 0.2961471 0.2236947 0.2208984
## 4 4 1.225791 0.5761424 0.9866593 0.2787682 0.1851034 0.2200482
## 5 5 1.278859 0.5495754 1.0169051 0.2830547 0.1710121 0.2216175
## 6 6 1.308214 0.5229591 1.0356905 0.2339732 0.1288105 0.1584882
## 7 7 1.296904 0.5371365 1.0484542 0.2381766 0.1406646 0.1611279
## 8 8 1.309094 0.5141480 1.0572115 0.2350217 0.1535222 0.1677642
## 9 9 1.342208 0.4932970 1.1037261 0.2429742 0.1672224 0.1883707
## 10 10 1.376904 0.4587657 1.1308800 0.2207559 0.1499499 0.1732258
## 11 11 1.373152 0.4681376 1.1202925 0.1983151 0.1275934 0.1583883
## 12 12 1.397733 0.4467969 1.1362206 0.1960747 0.1134860 0.1529012
## 13 13 1.385955 0.4640259 1.1253542 0.2100404 0.1026455 0.1673762
## 14 14 1.377975 0.4761592 1.1187266 0.2131420 0.1180236 0.1741279
## 15 15 1.404477 0.4508466 1.1482355 0.2170641 0.1259262 0.1786018
## 16 16 1.414462 0.4372621 1.1581106 0.2146812 0.1185752 0.1844132
## 17 17 1.412914 0.4450490 1.1545351 0.2086262 0.1098979 0.1833349
## 18 18 1.435392 0.4250024 1.1703251 0.2184785 0.1177511 0.1934116
## 19 19 1.440372 0.4235065 1.1728085 0.2064806 0.1163990 0.1802061
## 20 20 1.444242 0.4274910 1.1778401 0.2154125 0.1144566 0.1823835
Based on the information above, 4 is the optimal value for k, as the RMSE for k = 4 is the lowest across the values 1 to 20.
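The same conclusion is easier to see graphically; caret's plot method for train objects charts the cross-validated RMSE against the tuning parameter:
plot(kNN4)  # RMSE vs. k; the minimum falls at k = 4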
TechStocks <- read.csv(file="/Users/Alex/Dropbox/College/4-Senior/Machine Learning/Project3/TechStocks.csv",
                       header=TRUE, sep=",")
TechStocks$Date <- NULL
logRatio = function(x) { log(x[2:length(x)] / x[1:(length(x)-1)]) }  # daily log returns
returns = as.data.frame(apply(TechStocks, 2, logRatio))
annualizedMeans = colMeans(returns)*252
annualizedCovMatrix = cov(returns)*252
Imported the data and removed the date column. Computed the daily log returns, the annualized means of the returns, and the annualized covariance matrix (annualizing with 252 trading days).
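As a quick illustration of logRatio on a toy price series (hypothetical values):
logRatio(c(100, 110, 121))  # two log returns, each log(1.1), about 0.0953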
dvec <- matrix(0,9)     # zero linear term: solve.QP then minimizes (1/2) w' Σ w
amat <- matrix(1,9,1)   # a single constraint row of ones: the 9 weights...
b <- c(1)               # ...must sum to 1, enforced as an equality via meq = 1
solution <- solve.QP(annualizedCovMatrix, dvec, amat, b, meq=1)
meanReturn = sum(solution$solution * annualizedMeans)
meanReturn
## [1] 0.1837864
Solved the quadratic program for the minimum-variance portfolio of the TechStocks data and computed that portfolio's annualized mean return.
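The solution object also carries the optimal weights, so the portfolio's risk can be inspected alongside its return (a sketch using the objects above):
weights <- solution$solution
round(weights, 4)  # minimum-variance weights; negative entries are short positions
sqrt(t(weights) %*% annualizedCovMatrix %*% weights)  # annualized volatility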
SampleCol = function(col) { sample(col, length(col), replace=TRUE) }
ResampleReturn = function(returns) {
  # Bootstrap each return series, then re-solve the minimum-variance
  # problem on the resampled data and return its annualized mean return.
  reSamp = as.data.frame(apply(returns, 2, SampleCol))
  annualizedMeans = colMeans(reSamp)*252
  annualizedCovMatrix = cov(reSamp)*252
  dvec = rep(0,9)
  amat = matrix(1, 9, 1)
  b = c(1)
  solution = solve.QP(annualizedCovMatrix, dvec, amat, b, meq = 1)
  meanReturn = sum(solution$solution*annualizedMeans)
  meanReturn  # return the value explicitly rather than via invisible assignment
}
The resampling functions to use in part D: SampleCol bootstraps one return series, and ResampleReturn recomputes the minimum-risk portfolio's mean return on the resampled data.
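A single call illustrates the output; the value changes from run to run, so a seed is set here purely for repeatability:
set.seed(1)
ResampleReturn(returns)  # one bootstrapped minimum-risk mean return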
MinRiskMeanReturn = sort(replicate(1000, ResampleReturn(returns)))
confidence.level = 0.95
lowerlevel = (1-confidence.level)/2
upperlevel = confidence.level + (1-confidence.level)/2
ci <- quantile(MinRiskMeanReturn, probs = c(lowerlevel, upperlevel))
ci
## 2.5% 97.5%
## 0.1032236 0.3212093
hist(MinRiskMeanReturn)
Performed 1000 resamples, computed a 95% confidence interval for the resampled minimum-risk mean returns, and plotted a histogram of those returns.
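To tie the plot to the interval, the CI bounds can be overlaid on the histogram (a cosmetic sketch using the objects above):
hist(MinRiskMeanReturn, main = "Bootstrapped minimum-risk mean returns")
abline(v = ci, lty = 2)  # dashed lines at the 2.5% and 97.5% quantiles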