This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Import data. The file has no header row; column names are assigned below.
# stringsAsFactors = TRUE keeps the categorical columns as factors on
# R >= 4.0 too (the read.csv default became FALSE there). The factor
# summaries and the later as.numeric() conversions rely on factor columns.
cancer <- read.csv(
  file = "breast-cancer.data",
  sep = ",",
  header = FALSE,
  stringsAsFactors = TRUE
)
# Assign column names
colnames(cancer) <- c(
  "Class", "Age", "Menopause", "Tumoursize", "Inv_nodes",
  "Node_caps", "Deg_malig", "Breast", "Breast_quad", "Irradiat"
)
# View() opens a data viewer and is meant for interactive use, so it is
# skipped when the document is knitted or the script runs non-interactively.
if (interactive()) {
  View(cancer)
}
# Per-column summary: level counts for factors, quartiles for Deg_malig
summary(cancer)
## Class Age Menopause Tumoursize
## no-recurrence-events:201 20-29: 1 ge40 :129 30-34 :60
## recurrence-events : 85 30-39:36 lt40 : 7 25-29 :54
## 40-49:90 premeno:150 20-24 :50
## 50-59:96 15-19 :30
## 60-69:57 10-14 :28
## 70-79: 6 40-44 :22
## (Other):42
## Inv_nodes Node_caps Deg_malig Breast Breast_quad
## 0-2 :213 ? : 8 Min. :1.000 left :152 ? : 1
## 12-14: 3 no :222 1st Qu.:2.000 right:134 central : 21
## 15-17: 6 yes: 56 Median :2.000 left_low :110
## 24-26: 1 Mean :2.049 left_up : 97
## 3-5 : 36 3rd Qu.:3.000 right_low: 24
## 6-8 : 17 Max. :3.000 right_up : 33
## 9-11 : 10
## Irradiat
## no :218
## yes: 68
##
##
##
##
##
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact structure listing: one line per column with its type and first values
utils::str(cancer)
## 'data.frame': 286 obs. of 10 variables:
## $ Class : Factor w/ 2 levels "no-recurrence-events",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : Factor w/ 6 levels "20-29","30-39",..: 2 3 3 5 3 5 4 5 3 3 ...
## $ Menopause : Factor w/ 3 levels "ge40","lt40",..: 3 3 3 1 3 1 3 1 3 3 ...
## $ Tumoursize : Factor w/ 11 levels "0-4","10-14",..: 6 4 4 3 1 3 5 4 11 4 ...
## $ Inv_nodes : Factor w/ 7 levels "0-2","12-14",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Node_caps : Factor w/ 3 levels "?","no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Deg_malig : int 3 2 2 2 2 2 2 1 2 2 ...
## $ Breast : Factor w/ 2 levels "left","right": 1 2 1 2 2 1 1 1 1 2 ...
## $ Breast_quad: Factor w/ 6 levels "?","central",..: 3 6 3 4 5 3 3 3 3 4 ...
## $ Irradiat : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# dplyr's column-wise preview of the same data frame
dplyr::glimpse(cancer)
## Observations: 286
## Variables: 10
## $ Class <fct> no-recurrence-events, no-recurrence-events, no-rec...
## $ Age <fct> 30-39, 40-49, 40-49, 60-69, 40-49, 60-69, 50-59, 6...
## $ Menopause <fct> premeno, premeno, premeno, ge40, premeno, ge40, pr...
## $ Tumoursize <fct> 30-34, 20-24, 20-24, 15-19, 0-4, 15-19, 25-29, 20-...
## $ Inv_nodes <fct> 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, 0-2, ...
## $ Node_caps <fct> no, no, no, no, no, no, no, no, no, no, no, no, no...
## $ Deg_malig <int> 3, 2, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 1, 3, 3, 1, 2,...
## $ Breast <fct> left, right, left, right, right, left, left, left,...
## $ Breast_quad <fct> left_low, right_up, left_low, left_up, right_low, ...
## $ Irradiat <fct> no, no, no, no, no, no, no, no, no, no, no, no, no...
# Number of rows and columns
dim(x = cancer)
## [1] 286 10
You can also embed plots, for example:
library(ggplot2)
# Class counts, with side-by-side bars for each irradiation status
ggplot(data = cancer, mapping = aes(x = Class, fill = Irradiat)) +
  geom_bar(position = "dodge")

# Tumour-size counts, with side-by-side bars for each menopause status
ggplot(data = cancer, mapping = aes(x = Tumoursize, fill = Menopause)) +
  geom_bar(position = "dodge")

# Age counts coloured by tumour size, one panel per Age x Tumoursize pair
ggplot(data = cancer, mapping = aes(x = Age, fill = Tumoursize)) +
  geom_bar() +
  facet_grid(Age ~ Tumoursize)

# Age counts, with side-by-side bars for each tumour-size class
ggplot(data = cancer, mapping = aes(x = Age, fill = Tumoursize)) +
  geom_bar(position = "dodge")

# Pie chart of menopause status: a single stacked bar drawn in polar
# coordinates, with the counts mapped to the angle
ggplot(data = cancer, mapping = aes(x = "", fill = Menopause)) +
  geom_bar(width = 3) +
  coord_polar(theta = "y")

# Histogram of the degree of malignancy (base graphics).
# The argument expression is kept as written because hist() derives its
# default title and x-axis label from it.
hist(as.numeric(cancer$Deg_malig))
# Density plot of tumour size by breast side.
# Tumoursize is categorical, so without an explicit group ggplot2 forms one
# group per (size class, Breast) pair and fits a separate density to each;
# that is what produced the "Groups with fewer than two data points have
# been dropped" warnings. Grouping by Breast alone gives one curve per side,
# estimated over the size-class positions on the x axis.
# The classes are put in numeric order (by lower bound) first, because the
# default alphabetical order places "5-9" after "45-49".
size_classes <- unique(as.character(cancer$Tumoursize))
size_classes <- size_classes[order(as.numeric(sub("-.*$", "", size_classes)))]
ggplot(cancer, aes(x = Tumoursize, fill = Breast, group = Breast)) +
  geom_density(alpha = 0.3) +
  scale_x_discrete(limits = size_classes)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
# Density plot of tumour size by breast quadrant.
# Tumoursize is categorical, so the data are grouped by Breast_quad
# explicitly; otherwise ggplot2 fits one density per (size class, quadrant)
# pair and drops most of them with a warning. The "?" quadrant has a single
# row in this data set, so that one group is still dropped.
# Size classes are ordered by their numeric lower bound instead of
# alphabetically, and the fills are made translucent so overlapping curves
# stay visible.
size_classes <- unique(as.character(cancer$Tumoursize))
size_classes <- size_classes[order(as.numeric(sub("-.*$", "", size_classes)))]
ggplot(cancer, aes(x = Tumoursize, fill = Breast_quad, group = Breast_quad)) +
  geom_density(alpha = 0.3) +
  scale_x_discrete(limits = size_classes)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.6.1
# Convert "low-high" range labels (e.g. "10-14") to ordinal codes 1, 2, ...
# ranked by the numeric lower bound of each range.
# as.numeric() on the factor would instead return codes in alphabetical level
# order, which ranks "5-9" above "45-49" and "3-5" above "24-26" and so
# distorts the correlations.
range_rank <- function(x) {
  labels <- as.character(x)
  classes <- unique(labels)
  lower <- as.numeric(sub("-.*$", "", classes))
  as.numeric(match(labels, classes[order(lower)]))
}

age <- range_rank(cancer$Age)
tumoursize <- range_rank(cancer$Tumoursize)
inv_nodes <- range_rank(cancer$Inv_nodes)
# Going through as.character() returns the recorded grade 1-3 whether
# Deg_malig is still an integer column or has already been made a factor.
deg_malig <- as.numeric(as.character(cancer$Deg_malig))

# Creating correlogram dataset
corr_dataset <- data.frame(age, tumoursize, inv_nodes, deg_malig)
# Pairwise correlations, rounded to one decimal for the plot labels
corr <- round(cor(corr_dataset), 1)

# Correlogram chart: lower triangle, variables ordered by hierarchical
# clustering, coefficients printed on the circles
ggcorrplot(
  corr,
  hc.order = TRUE,
  method = "circle",
  lab = TRUE,
  type = "lower"
)
# MODEL BUILDING
# supervised
# LOGISTIC REGRESSION
# Modelling data: two predictors and the response, selected by name so the
# selection does not depend on column positions.
log_data <- cancer[c("Class", "Deg_malig", "Irradiat")]
# View() is meant for interactive use; skip it when knitting.
if (interactive()) {
  View(log_data)
}
# NOTE(review): this recodes Deg_malig in `cancer` only. log_data was copied
# above, so its Deg_malig column stays as it was and the model below is not
# affected. Confirm whether the recode was meant for log_data.
cancer$Deg_malig <- factor(cancer$Deg_malig, levels = c(1, 2, 3))
# shuffling
set.seed(1000)
# sample.int(n) draws the same permutation as sample(1:n) and also behaves
# correctly for an empty data frame, where 1:0 would yield c(1, 0).
shuf_ind <- sample.int(nrow(log_data))
log_data <- log_data[shuf_ind, ]
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1
set.seed(1234)
# Logical vector marking about 80% of the rows for training, drawn by
# caTools::sample.split() on the Irradiat labels
split <- sample.split(log_data$Irradiat, SplitRatio = 0.8)
# training set: rows flagged TRUE.
# Indexing with the logical vector directly avoids comparing against T/F,
# which are ordinary variables that can be reassigned.
training_set <- log_data[split, , drop = FALSE]
# test set: the remaining rows
test_set <- log_data[!split, , drop = FALSE]
# View() is meant for interactive use; skip it when knitting.
if (interactive()) {
  View(training_set)
  View(test_set)
}
# generalised linear model: logistic regression of Irradiat on every other
# column of the training set
classifier <- glm(
  formula = Irradiat ~ .,
  family = binomial(),
  data = training_set
)
summary(classifier)
##
## Call:
## glm(formula = Irradiat ~ ., family = binomial(), data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0722 -0.6618 -0.6402 -0.4903 2.0873
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6354 0.5361 -4.915 8.86e-07 ***
## Classrecurrence-events 0.6510 0.3518 1.851 0.0642 .
## Deg_malig 0.5772 0.2428 2.377 0.0174 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 249.62 on 227 degrees of freedom
## Residual deviance: 234.98 on 225 degrees of freedom
## AIC: 240.98
##
## Number of Fisher Scoring iterations: 4
# Predicted probabilities for the test rows on the response scale.
# Column 3 (Irradiat, the response) is left out of the data passed to predict().
prob_pred <- predict(
  object = classifier,
  newdata = test_set[-3],
  type = "response"
)
prob_pred
## 29 186 35 70 203 91 108
## 0.1852918 0.1852918 0.1132330 0.1852918 0.1966856 0.1852918 0.1852918
## 275 31 248 42 268 28 69
## 0.3036657 0.2882987 0.4371676 0.1852918 0.4371676 0.2882987 0.2882987
## 226 57 179 32 234 168 222
## 0.4371676 0.1132330 0.1852918 0.1852918 0.1966856 0.1852918 0.1966856
## 5 2 146 282 195 55 266
## 0.1852918 0.1852918 0.1852918 0.3036657 0.1852918 0.1132330 0.4371676
## 103 208 210 20 4 45 123
## 0.1852918 0.3036657 0.1966856 0.1132330 0.1852918 0.1132330 0.2882987
## 237 8 172 153 19 212 141
## 0.3036657 0.1132330 0.1132330 0.2882987 0.2882987 0.3036657 0.1852918
## 149 199 43 261 134 67 12
## 0.2882987 0.1852918 0.1132330 0.3036657 0.2882987 0.1132330 0.1852918
## 14 175 50 211 11 137 235
## 0.2882987 0.1852918 0.1132330 0.3036657 0.2882987 0.2882987 0.4371676
## 155 255
## 0.2882987 0.4371676
# Turn probabilities into class labels: 1 when the probability exceeds 0.4,
# otherwise 0. ifelse() keeps the row names carried by prob_pred.
y_pred <- ifelse(test = prob_pred > 0.4, yes = 1, no = 0)
y_pred
## 29 186 35 70 203 91 108 275 31 248 42 268 28 69 226 57 179 32
## 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0
## 234 168 222 5 2 146 282 195 55 266 103 208 210 20 4 45 123 237
## 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 8 172 153 19 212 141 149 199 43 261 134 67 12 14 175 50 211 11
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 137 235 155 255
## 0 1 0 1
# Confusion matrix: actual Irradiat (rows) against predicted class (columns).
# Both margins are factors with fixed levels so the table is always 2 x 2,
# even if every prediction falls in one class. Otherwise the table collapses
# to a single column, the diagonal no longer holds the correct counts, and
# fourfoldplot() rejects the table.
ctable <- table(
  actual = factor(test_set[, 3], levels = c("no", "yes")),
  predicted = factor(y_pred, levels = c(0, 1))
)
# accuracy: share of test rows on the diagonal of the confusion matrix
accuracy <- sum(diag(ctable)) / sum(ctable)
# error rate
err_rate <- 1 - accuracy
# Fourfold plot of the confusion matrix; conf.level = 0 turns off the
# confidence rings
fourfoldplot(
  ctable,
  color = c("red", "green"),
  conf.level = 0,
  main = "Confusion Matrix"
)
Note that the `echo = TRUE` parameter was added to the code chunk so that the R code that generated the plot is printed along with its output.