Breast Cancer Analysis

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Import data
# The file has no header row, so column names are assigned explicitly below.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped creating
# factors by default in R 4.0.0, and the code below treats the categorical
# columns as factors (see the summary() and str() output).
cancer <- read.csv(
  file = "breast-cancer.data",
  sep = ",",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Assign column names
colnames(cancer) <- c(
  "Class", "Age", "Menopause", "Tumoursize", "Inv_nodes",
  "Node_caps", "Deg_malig", "Breast", "Breast_quad", "Irradiat"
)


View(cancer)
summary(cancer)

##                   Class        Age       Menopause     Tumoursize
##  no-recurrence-events:201   20-29: 1   ge40   :129   30-34  :60  
##  recurrence-events   : 85   30-39:36   lt40   :  7   25-29  :54  
##                             40-49:90   premeno:150   20-24  :50  
##                             50-59:96                 15-19  :30  
##                             60-69:57                 10-14  :28  
##                             70-79: 6                 40-44  :22  
##                                                      (Other):42  
##  Inv_nodes   Node_caps   Deg_malig       Breast       Breast_quad 
##  0-2  :213   ?  :  8   Min.   :1.000   left :152   ?        :  1  
##  12-14:  3   no :222   1st Qu.:2.000   right:134   central  : 21  
##  15-17:  6   yes: 56   Median :2.000               left_low :110  
##  24-26:  1             Mean   :2.049               left_up  : 97  
##  3-5  : 36             3rd Qu.:3.000               right_low: 24  
##  6-8  : 17             Max.   :3.000               right_up : 33  
##  9-11 : 10                                                        
##  Irradiat 
##  no :218  
##  yes: 68  
##           
##           
##           
##           
##

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Compact structure of the data frame: one line per column with its type
str(cancer)

## 'data.frame':    286 obs. of  10 variables:
##  $ Class      : Factor w/ 2 levels "no-recurrence-events",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age        : Factor w/ 6 levels "20-29","30-39",..: 2 3 3 5 3 5 4 5 3 3 ...
##  $ Menopause  : Factor w/ 3 levels "ge40","lt40",..: 3 3 3 1 3 1 3 1 3 3 ...
##  $ Tumoursize : Factor w/ 11 levels "0-4","10-14",..: 6 4 4 3 1 3 5 4 11 4 ...
##  $ Inv_nodes  : Factor w/ 7 levels "0-2","12-14",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Node_caps  : Factor w/ 3 levels "?","no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Deg_malig  : int  3 2 2 2 2 2 2 1 2 2 ...
##  $ Breast     : Factor w/ 2 levels "left","right": 1 2 1 2 2 1 1 1 1 2 ...
##  $ Breast_quad: Factor w/ 6 levels "?","central",..: 3 6 3 4 5 3 3 3 3 4 ...
##  $ Irradiat   : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

# Column-wise overview of the same data frame (glimpse() is available once
# dplyr is attached)
glimpse(cancer)

## Observations: 286
## Variables: 10
## $ Class       <fct> no-recurrence-events, no-recurrence-events, no-rec...
## $ Age         <fct> 30-39, 40-49, 40-49, 60-69, 40-49, 60-69, 50-59, 6...
## $ Menopause   <fct> premeno, premeno, premeno, ge40, premeno, ge40, pr...
## $ Tumoursize  <fct> 30-34, 20-24, 20-24, 15-19, 0-4, 15-19, 25-29, 20-...
## $ Inv_nodes   <fct> 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, ...
## $ Node_caps   <fct> no, no, no, no, no, no, no, no, no, no, no, no, no...
## $ Deg_malig   <int> 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 1, 3, 3, 1, 2,...
## $ Breast      <fct> left, right, left, right, right, left, left, left,...
## $ Breast_quad <fct> left_low, right_up, left_low, left_up, right_low, ...
## $ Irradiat    <fct> no, no, no, no, no, no, no, no, no, no, no, no, no...

# Number of rows and columns
dim(cancer)

## [1] 286  10

Including Plots

You can also embed plots, for example:

library(ggplot2)
# Side-by-side bars: recurrence class, split by irradiation status
ggplot(data = cancer, mapping = aes(x = Class, fill = Irradiat)) +
  geom_bar(position = "dodge")

# Side-by-side bars: tumour size bin, split by menopause status
ggplot(data = cancer, mapping = aes(x = Tumoursize, fill = Menopause)) +
  geom_bar(position = "dodge")

# Age counts coloured by tumour size, one panel per Age x Tumoursize pair
ggplot(data = cancer, mapping = aes(x = Age, fill = Tumoursize)) +
  geom_bar() +
  facet_grid(Age ~ Tumoursize)

# Side-by-side bars: age group, split by tumour size bin
ggplot(data = cancer, mapping = aes(x = Age, fill = Tumoursize)) +
  geom_bar(position = "dodge")

# Pie chart for Menopause: a single stacked bar drawn in polar coordinates
ggplot(data = cancer, mapping = aes(x = "", fill = Menopause)) +
  geom_bar(width = 3) +
  coord_polar(theta = "y")

# Base-graphics histogram of the degree of malignancy
hist(x = as.numeric(cancer$Deg_malig))

# Density plot of tumour size by breast
# Tumoursize is a factor of size bins such as "10-14". Mapping the factor to x
# makes ggplot2 group by every bin as well as by Breast, so the density is
# estimated within each bin (and bins with fewer than two rows are dropped
# with a "Groups with fewer than two data points" warning) instead of across
# tumour sizes. The lower bound of each bin is used as a numeric x instead.
ggplot(
  cancer,
  aes(
    x = as.numeric(sub("-.*", "", as.character(Tumoursize))),
    fill = Breast
  )
) +
  geom_density(alpha = 0.3) +
  labs(x = "Tumoursize (lower bound of bin)")

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

# Density plot of tumour size by breast quadrant
# Tumoursize is a factor of size bins such as "10-14". Mapping the factor to x
# makes ggplot2 group by every bin as well as by Breast_quad, which leaves
# many groups with fewer than two rows (each one is dropped with a warning).
# The lower bound of each bin is used as a numeric x so that one density is
# estimated per quadrant across all tumour sizes.
ggplot(
  cancer,
  aes(
    x = as.numeric(sub("-.*", "", as.character(Tumoursize))),
    fill = Breast_quad
  )
) +
  geom_density() +
  labs(x = "Tumoursize (lower bound of bin)")

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 3.6.1

# Numeric encodings of the ordinal variables used in the correlogram.
# Age, Tumoursize and Inv_nodes are factors of ranges such as "3-5".
# as.numeric() on a factor returns the index of the level, and the levels are
# sorted alphabetically (e.g. "12-14" comes before "3-5"), so those codes do
# not follow the size of the range. The lower bound of each range is parsed
# instead so that larger ranges always get larger numbers.
range_lower_bound <- function(x) {
  as.numeric(sub("-.*", "", as.character(x)))
}

age <- range_lower_bound(cancer$Age)
tumoursize <- range_lower_bound(cancer$Tumoursize)
inv_nodes <- range_lower_bound(cancer$Inv_nodes)
deg_malig <- as.numeric(cancer$Deg_malig)

# Creating correlogram dataset
corr_dataset <- data.frame(age, tumoursize, inv_nodes, deg_malig)
# Pairwise correlations, rounded to one decimal for the plot labels
corr <- round(cor(corr_dataset), 1)

# Correlogram chart
ggcorrplot(
  corr,
  hc.order = TRUE,
  method = "circle",
  lab = TRUE,
  type = "lower"
)

# MODEL BUILDING
# supervised
# LOGISTIC REGRESSION

# Keep the response (Irradiat) and the two predictors. Columns are selected by
# name (they are columns 1, 7 and 10) so the subset does not depend on the
# column order.
log_data <- cancer[c("Class", "Deg_malig", "Irradiat")]
View(log_data)
# NOTE(review): this recodes Deg_malig in `cancer` only. log_data was copied
# above and keeps the integer column, so the model below still treats
# Deg_malig as numeric -- confirm that is intended.
cancer$Deg_malig <- factor(cancer$Deg_malig, levels = c(1, 2, 3))



# shuffling
set.seed(1000)
# seq_len() instead of 1:nrow() so an empty data frame yields no indices
shuf_ind <- sample(seq_len(nrow(log_data)))
log_data <- log_data[shuf_ind, ]

#splitting data
library(caTools)

## Warning: package 'caTools' was built under R version 3.6.1

set.seed(1234)
# Logical vector with one entry per row of log_data, produced by
# sample.split() from the Irradiat labels with SplitRatio = 0.8.
split <- sample.split(log_data$Irradiat, SplitRatio = 0.8)

# training set: rows flagged TRUE
# (the logical vector is used directly rather than compared with T/F, which
# are ordinary variables that can be reassigned)
training_set <- subset(log_data, split)

# test set: the remaining rows
test_set <- subset(log_data, !split)

View(training_set)
View(test_set)

# generalised linear model: logistic regression of Irradiat on all other
# columns of the training set
classifier <- glm(
  formula = Irradiat ~ .,
  family = binomial(),
  data = training_set
)
summary(classifier)

## 
## Call:
## glm(formula = Irradiat ~ ., family = binomial(), data = training_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0722  -0.6618  -0.6402  -0.4903   2.0873  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -2.6354     0.5361  -4.915 8.86e-07 ***
## Classrecurrence-events   0.6510     0.3518   1.851   0.0642 .  
## Deg_malig                0.5772     0.2428   2.377   0.0174 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 249.62  on 227  degrees of freedom
## Residual deviance: 234.98  on 225  degrees of freedom
## AIC: 240.98
## 
## Number of Fisher Scoring iterations: 4

# Predicted probabilities for the test rows; the third column (Irradiat, the
# response) is dropped from the data handed to predict()
prob_pred <- predict(
  object = classifier,
  newdata = test_set[-3],
  type = "response"
)
prob_pred

##        29       186        35        70       203        91       108 
## 0.1852918 0.1852918 0.1132330 0.1852918 0.1966856 0.1852918 0.1852918 
##       275        31       248        42       268        28        69 
## 0.3036657 0.2882987 0.4371676 0.1852918 0.4371676 0.2882987 0.2882987 
##       226        57       179        32       234       168       222 
## 0.4371676 0.1132330 0.1852918 0.1852918 0.1966856 0.1852918 0.1966856 
##         5         2       146       282       195        55       266 
## 0.1852918 0.1852918 0.1852918 0.3036657 0.1852918 0.1132330 0.4371676 
##       103       208       210        20         4        45       123 
## 0.1852918 0.3036657 0.1966856 0.1132330 0.1852918 0.1132330 0.2882987 
##       237         8       172       153        19       212       141 
## 0.3036657 0.1132330 0.1132330 0.2882987 0.2882987 0.3036657 0.1852918 
##       149       199        43       261       134        67        12 
## 0.2882987 0.1852918 0.1132330 0.3036657 0.2882987 0.1132330 0.1852918 
##        14       175        50       211        11       137       235 
## 0.2882987 0.1852918 0.1132330 0.3036657 0.2882987 0.2882987 0.4371676 
##       155       255 
## 0.2882987 0.4371676

# Turn probabilities into class labels: 1 above the 0.4 cut-off, otherwise 0
y_pred <- ifelse(test = prob_pred > 0.4, yes = 1, no = 0)
y_pred

##  29 186  35  70 203  91 108 275  31 248  42 268  28  69 226  57 179  32 
##   0   0   0   0   0   0   0   0   0   1   0   1   0   0   1   0   0   0 
## 234 168 222   5   2 146 282 195  55 266 103 208 210  20   4  45 123 237 
##   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0 
##   8 172 153  19 212 141 149 199  43 261 134  67  12  14 175  50 211  11 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 137 235 155 255 
##   0   1   0   1

# confusion matrix: rows are the observed values in column 3 of test_set,
# columns are the predicted classes.
# The predictions are given explicit levels 0 and 1 so the table stays 2 x 2
# even when only one class is predicted; diag() and fourfoldplot() below both
# need a 2 x 2 table.
ctable <- table(test_set[, 3], y_pred = factor(y_pred, levels = c(0, 1)))


# accuracy: share of test rows on the diagonal of the confusion matrix
accuracy <- sum(diag(ctable)) / sum(ctable)
# error rate
err_rate <- 1 - accuracy

# Fourfold plot of the confusion matrix
fourfoldplot(ctable, color = c("red", "green"),
             conf.level = 0, main = "Confusion Matrix")

Note that the echo = TRUE parameter was added to the code chunk so that the R code that generated the plot is printed along with it; use echo = FALSE to prevent printing of the code.

Breast Cancer Analysis

Priyanka,Anitha

10/8/2019

R Markdown

Including Plots