library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rsample)
## Loading required package: tidyr
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  2.1.3     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v purrr   0.3.3     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load the cancer data set and hold out 20% of the rows as a test set.
cnc <- read.csv(file = "data/cancer.csv", header = TRUE)
# read.csv() only turns string columns into factors by default before R 4.0;
# convert the outcome explicitly so it is a factor regardless of R version.
cnc$diagnosis <- as.factor(cnc$diagnosis)
set.seed(100) # fix the RNG so the random split is reproducible
train_test_split <- initial_split(cnc, prop = 0.8)
train_test_split
## <456/113/569>
train_set <- training(train_test_split)
test_set <- testing(train_test_split)

# Exploratory analysis: count the missing values in every column.
# (If any were present, na.omit() would drop each row containing one.)
# vapply() pins the result to one integer per column, unlike sapply().
vapply(cnc, function(column) sum(is.na(column)), integer(1))
##                      id               diagnosis             radius_mean 
##                       0                       0                       0 
##            texture_mean          perimeter_mean               area_mean 
##                       0                       0                       0 
##         smoothness_mean        compactness_mean          concavity_mean 
##                       0                       0                       0 
##     concave.points_mean           symmetry_mean  fractal_dimension_mean 
##                       0                       0                       0 
##               radius_se              texture_se            perimeter_se 
##                       0                       0                       0 
##                 area_se           smoothness_se          compactness_se 
##                       0                       0                       0 
##            concavity_se       concave.points_se             symmetry_se 
##                       0                       0                       0 
##    fractal_dimension_se            radius_worst           texture_worst 
##                       0                       0                       0 
##         perimeter_worst              area_worst        smoothness_worst 
##                       0                       0                       0 
##       compactness_worst         concavity_worst    concave.points_worst 
##                       0                       0                       0 
##          symmetry_worst fractal_dimension_worst 
##                       0                       0

# The id column is not used as a predictor, so remove it from both the
# training and the test partition.
train_set[["id"]] <- NULL
test_set[["id"]] <- NULL

# Rank the predictors with an LVQ model evaluated by 10-fold cross-validation
# repeated 3 times. `diagnosis ~ .` uses every other column as a predictor.
# Centering subtracts each predictor's mean and scaling divides by its
# standard deviation (z-scores), so all predictors are on a comparable scale.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
model <- train(
  diagnosis ~ .,
  data = train_set,
  method = "lvq",
  trControl = control,
  preProcess = c("scale", "center")
)
importance <- varImp(model, scale = FALSE)
plot(importance)

# Drop the predictors judged unimportant from the importance plot (reported
# as scoring below 0.6). They are selected by name rather than by position so
# the result does not depend on column order or on `id` having been removed
# first; these names are columns 13, 16, 20 and 21 of the id-less data.
low_importance <- c(
  "texture_se", "smoothness_se", "symmetry_se", "fractal_dimension_se"
)
train_new <- train_set[, !(names(train_set) %in% low_importance), drop = FALSE]
test_new <- test_set[, !(names(test_set) %in% low_importance), drop = FALSE]
# Resampling scheme: 10-fold cross-validation repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

Logistic Regression

# Logistic regression (binomial GLM) on centered and scaled predictors,
# resampled according to `control`.
# NOTE(review): the fit warns about non-convergence and fitted probabilities
# of 0 or 1 -- possibly (quasi-)complete separation; confirm before trusting
# the coefficients.
logis <- train(
  form = diagnosis ~ .,
  data = train_new,
  method = "glm",
  family = "binomial",
  trControl = control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Support Vector Machines

# Linear-kernel support vector machine on centered and scaled predictors,
# resampled according to `control`.
svm <- train(
  form = diagnosis ~ .,
  data = train_new,
  method = "svmLinear",
  trControl = control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)

Decision Trees

# Decision tree (rpart) on centered and scaled predictors, tuned on accuracy
# and resampled according to `control`.
dct <- train(
  form = diagnosis ~ .,
  data = train_new,
  method = "rpart",
  metric = "Accuracy",
  trControl = control,
  preProcess = c("center", "scale"),
  tuneLength = 5
)

Predicting SVM

# Classify the held-out rows with the SVM, then cross-tabulate the predicted
# classes (rows) against the actual diagnoses (columns).
predsvm <- predict(svm, newdata = test_new, type = "raw")
table(predsvm, test_new$diagnosis)
##        
## predsvm  B  M
##       B 71  5
##       M  1 36

Predicting Logistic

# Classify the held-out rows with the logistic model, then cross-tabulate the
# predicted classes (rows) against the actual diagnoses (columns).
predlog <- predict(logis, newdata = test_new, type = "raw")
table(predlog, test_new$diagnosis)
##        
## predlog  B  M
##       B 67  4
##       M  5 37

Predicting Decision Tree

# Classify the held-out rows with the decision tree, then cross-tabulate the
# predicted classes (rows) against the actual diagnoses (columns).
preddct <- predict(dct, newdata = test_new, type = "raw")
table(preddct, test_new$diagnosis)
##        
## preddct  B  M
##       B 66  6
##       M  6 35

ROC Curve for Support Vector Machine

# ROC curve for the SVM on the test set.
# plot.roc() needs a numeric predictor, so the actual and predicted classes
# are converted explicitly to their integer level codes (1 and 2) instead of
# relying on c() to strip the factor class.
# NOTE(review): the predictor is a hard class label, not a class probability,
# so the curve has a single interior operating point; consider predicting
# with type = "prob" if a full curve is wanted.
response1 <- as.integer(as.factor(test_new$diagnosis))
predictor1 <- as.integer(predsvm)

# legacy.axes = TRUE plots the x axis as increasing 1 - specificity so that
# it matches the "False Positive Rate" label.
roc1 <- plot.roc(
  response1, predictor1,
  main = "ROC for SVM",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  percent = TRUE, legacy.axes = TRUE, col = "green"
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases

ROC Curve for Decision Tree

# ROC curve for the decision tree on the test set.
# plot.roc() needs a numeric predictor, so the actual and predicted classes
# are converted explicitly to their integer level codes (1 and 2) instead of
# relying on c() to strip the factor class.
# NOTE(review): the predictor is a hard class label, not a class probability,
# so the curve has a single interior operating point; consider predicting
# with type = "prob" if a full curve is wanted.
response2 <- as.integer(as.factor(test_new$diagnosis))
predictor2 <- as.integer(preddct)

# legacy.axes = TRUE plots the x axis as increasing 1 - specificity so that
# it matches the "False Positive Rate" label.
roc2 <- plot.roc(
  response2, predictor2,
  main = "ROC for DT",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  percent = TRUE, legacy.axes = TRUE, col = "black"
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases

ROC Curve for Logistic Regression

# ROC curve for the logistic regression on the test set.
# plot.roc() needs a numeric predictor, so the actual and predicted classes
# are converted explicitly to their integer level codes (1 and 2) instead of
# relying on c() to strip the factor class.
# NOTE(review): the predictor is a hard class label, not a class probability,
# so the curve has a single interior operating point; consider predicting
# with type = "prob" if a full curve is wanted.
response4 <- as.integer(as.factor(test_new$diagnosis))
predictor4 <- as.integer(predlog)

# legacy.axes = TRUE plots the x axis as increasing 1 - specificity so that
# it matches the "False Positive Rate" label.
roc4 <- plot.roc(
  response4, predictor4,
  main = "ROC for LR",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  percent = TRUE, legacy.axes = TRUE, col = "magenta"
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases

All ROC

# Overlay the three test-set ROC curves on a single plot. The first call
# draws the axes and title; the other curves are added with add = TRUE rather
# than par(new = TRUE), which would redraw axes and title on top of themselves.
# legacy.axes = TRUE plots the x axis as increasing 1 - specificity so that
# it matches the "False Positive Rate" label.
roc1 <- plot.roc(
  response1, predictor1,
  main = "ROC for SVM, LR and DT",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  percent = TRUE, legacy.axes = TRUE, col = "green"
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
roc2 <- plot.roc(
  response2, predictor2,
  percent = TRUE, col = "black", add = TRUE
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
roc4 <- plot.roc(
  response4, predictor4,
  percent = TRUE, col = "magenta", add = TRUE
)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
# Legend entries follow the colours used above: SVM is green, DT (roc2) is
# black and LR (roc4) is magenta.
legend(
  "bottomright",
  legend = c("SVM", "DT", "LR"),
  col = c("green", "black", "magenta"),
  lwd = 2
)