library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rsample)
## Loading required package: tidyr
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v purrr 0.3.3 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# read in the cancer data set; the diagnosis column holds the class labels (B = benign, M = malignant)
cnc <- read.csv(file = "data/cancer.csv", header = TRUE)
# reproducible 80/20 train/test split with rsample
set.seed(100)
train_test_split <- initial_split(cnc, prop = 0.8)
train_test_split
## <456/113/569>
train_set <- training(train_test_split)
test_set <- testing(train_test_split)
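As a quick sanity check (not part of the original analysis), the class balance of the two partitions can be compared to confirm that the 80/20 split did not skew the benign/malignant mix.
# proportion of benign (B) and malignant (M) cases in each partition
prop.table(table(train_set$diagnosis))
prop.table(table(test_set$diagnosis))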
# exploratory analysis: sapply counts the missing values in each column;
# na.omit(data) would drop any row that contains a missing value
sapply(cnc, function(x) sum(is.na(x)))
## id diagnosis radius_mean
## 0 0 0
## texture_mean perimeter_mean area_mean
## 0 0 0
## smoothness_mean compactness_mean concavity_mean
## 0 0 0
## concave.points_mean symmetry_mean fractal_dimension_mean
## 0 0 0
## radius_se texture_se perimeter_se
## 0 0 0
## area_se smoothness_se compactness_se
## 0 0 0
## concavity_se concave.points_se symmetry_se
## 0 0 0
## fractal_dimension_se radius_worst texture_worst
## 0 0 0
## perimeter_worst area_worst smoothness_worst
## 0 0 0
## compactness_worst concavity_worst concave.points_worst
## 0 0 0
## symmetry_worst fractal_dimension_worst
## 0 0
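No column has missing values here; if any count above were nonzero, complete cases could be kept as the comment describes. This is a hypothetical step for this data, and cnc_complete is an illustrative name.
# drop every row that contains at least one NA
cnc_complete <- na.omit(cnc)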
# the id column carries no predictive information, so it is dropped by setting it to NULL
train_set$id <- NULL
test_set$id  <- NULL
# Repeated 10-fold cross-validation (3 repeats). In the formula "diagnosis ~ ." the dot stands for
# all remaining columns as predictors. preProcess = c("scale", "center") z-scores the predictors
# (center subtracts the mean, scale divides by the standard deviation) so that every predictor
# contributes on a comparable scale. The LVQ model is fit here only to rank variable importance.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model <- train(diagnosis ~ ., data = train_set, method = "lvq",
               preProcess = c("scale", "center"), trControl = control)
# keep the importance scores on their original scale (scale = FALSE skips rescaling to 0-100)
importance <- varImp(model, scale = FALSE)
plot(importance)
# drop the predictors whose importance score falls below 0.6 (columns 13, 16, 20 and 21 of the
# data after removing id); see the sketch below for an order-independent, name-based version
train_new <- train_set[, -c(13, 16, 20, 21)]
test_new  <- test_set[, -c(13, 16, 20, 21)]
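Hard-coding column positions works but is fragile if the column order changes; the same predictors can be selected by name from the importance scores. This is an optional sketch, assuming importance$importance holds one score column per class (as caret's model-free importance does for two-class problems), and it should match the indices above as long as those four predictors are the only ones scoring below 0.6.
# names of the predictors whose best per-class importance is below the 0.6 cutoff
imp_scores <- importance$importance
low_imp <- rownames(imp_scores)[apply(imp_scores, 1, max) < 0.6]
low_imp
# drop them by name rather than by position
train_new <- train_set[, !(names(train_set) %in% low_imp)]
test_new  <- test_set[, !(names(test_set) %in% low_imp)]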
control <- trainControl("repeatedcv", number = 10, repeats = 3)
Logistic Regression
logis <- train(diagnosis ~ ., data = train_new, method = "glm", family = "binomial",
               preProcess = c("center", "scale"), trControl = control, tuneLength = 5)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (these two warnings repeat for every cross-validation resample; repeated output omitted)
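The repeated warnings point to (quasi-)complete separation: some of the scaled predictors split benign from malignant almost perfectly, so the unpenalized maximum-likelihood coefficients grow without bound even though the fitted classifier can still predict. If the warnings need to be addressed, a penalized logistic regression is one common remedy; the sketch below is an optional alternative, not part of the original analysis, and assumes the glmnet package is installed (logis_pen is an illustrative name).
# elastic-net penalized logistic regression; caret infers the binomial family from the
# two-class factor outcome, and tuneLength = 5 searches a small alpha/lambda grid
logis_pen <- train(diagnosis ~ ., data = train_new, method = "glmnet",
                   preProcess = c("center", "scale"),
                   trControl = control, tuneLength = 5)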
Support Vector Machines
svm <- train(diagnosis ~ ., data = train_new, method = "svmLinear",
             preProcess = c("center", "scale"), trControl = control, tuneLength = 5)
Decision Trees
dct <- train(diagnosis ~ ., data = train_new, method = "rpart", metric = "Accuracy",
             preProcess = c("center", "scale"), trControl = control, tuneLength = 5)
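Because all three models share the same repeated cross-validation setup, caret's resamples() can summarise their resampled accuracy and kappa side by side. This is an optional comparison, not part of the original write-up; for a strictly paired comparison the same resampling indices would need to be fixed (e.g. via the index argument of trainControl), and the list names below are illustrative.
# cross-validated performance of the three classifiers, collected in one object
resamps <- resamples(list(LR = logis, SVM = svm, DT = dct))
summary(resamps)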
# class predictions on the hold-out test set, followed by the confusion table against the true labels
predsvm <- predict(svm, test_new, type = "raw")
table(predsvm, test_new$diagnosis)
##
## predsvm B M
## B 71 5
## M 1 36
predlog <- predict(logis, test_new, type = "raw")
table(predlog, test_new$diagnosis)
##
## predlog B M
## B 67 4
## M 5 37
preddct <- predict(dct, test_new, type = "raw")
table(preddct, test_new$diagnosis)
##
## preddct B M
## B 66 6
## M 6 35
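Beyond the raw counts, caret's confusionMatrix() reports accuracy, sensitivity, specificity and related statistics. This is an optional summary, treating the malignant class as the positive one; it assumes diagnosis was read in as a factor with levels B and M (wrap it in factor() otherwise).
# detailed test-set metrics for each model, with "M" (malignant) as the positive class
confusionMatrix(data = predsvm, reference = test_new$diagnosis, positive = "M")
confusionMatrix(data = predlog, reference = test_new$diagnosis, positive = "M")
confusionMatrix(data = preddct, reference = test_new$diagnosis, positive = "M")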
# pROC wants numeric vectors; convert the factor labels to their integer codes (B = 1, M = 2)
response1 <- as.numeric(test_new$diagnosis)
predictor1 <- as.numeric(predsvm)
roc1 <- plot.roc(response1, predictor1, main = "ROC for SVM",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "green")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
response2 <- as.numeric(test_new$diagnosis)
predictor2 <- as.numeric(preddct)
roc2 <- plot.roc(response2, predictor2, main = "ROC for DT",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "black")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
response4 <- as.numeric(test_new$diagnosis)
predictor4 <- as.numeric(predlog)
roc4 <- plot.roc(response4, predictor4, main = "ROC for LR",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "magenta")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
All ROC Curves
roc1 <- plot.roc(response1, predictor1, main = "ROC for SVM, LR and DT",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "green")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
par(new=TRUE)
roc2 <- plot.roc(response2, predictor2, main = "ROC for SVM, LR and DT",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "black")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
par(new=TRUE)
roc4 <- plot.roc(response4, predictor4, main = "ROC for SVM, LR and DT",
                 ylab = "True Positive Rate", xlab = "False Positive Rate",
                 percent = TRUE, col = "magenta")
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
legend("bottomright", legend = c("SVM", "LR",'DT'), col = c("green", "black", "magenta"),lwd = 2)