Gehad Gad
February 2nd, 2020
Assignment 1
Introduction: This data set contains breast cancer findings obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. It includes 201 instances of one class (no-recurrence-events) and 85 instances of another class (recurrence-events). The instances are described by 9 attributes, some of which are linear and some of which are nominal.
#Import libraries and/or Packages
if (!require(dplyr)){
install.packages("dplyr")
library(dplyr)}
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require(caTools)){
install.packages("caTools")
library(caTools)}
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.6.2
if (!require(ggplot2)){
install.packages("ggplot2")
library(ggplot2)}
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
Read the data into R and name the features of the data.
# Download the UCI breast-cancer file (it has no header row) and attach the
# column names while reading.
BreastCancer <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data",
  header = FALSE,
  col.names = c(
    "output", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"
  )
)
Display the first few rows of the data.
# Preview the first six rows of the data frame.
head(BreastCancer, n = 6L)
## output age menopause tumor_size inv_nodes node_caps
## 1 no-recurrence-events 30-39 premeno 30-34 0-2 no
## 2 no-recurrence-events 40-49 premeno 20-24 0-2 no
## 3 no-recurrence-events 40-49 premeno 20-24 0-2 no
## 4 no-recurrence-events 60-69 ge40 15-19 0-2 no
## 5 no-recurrence-events 40-49 premeno 0-4 0-2 no
## 6 no-recurrence-events 60-69 ge40 15-19 0-2 no
## deg_malig breast breast_quad irradiat
## 1 3 left left_low no
## 2 2 right right_up no
## 3 2 left left_low no
## 4 2 right left_up no
## 5 2 right right_low no
## 6 2 left left_low no
#Display a per-column summary of the data (level counts for the categorical
#columns, quartiles and mean for the numeric column).
summary(BreastCancer)
## output age menopause tumor_size
## no-recurrence-events:201 20-29: 1 ge40 :129 30-34 :60
## recurrence-events : 85 30-39:36 lt40 : 7 25-29 :54
## 40-49:90 premeno:150 20-24 :50
## 50-59:96 15-19 :30
## 60-69:57 10-14 :28
## 70-79: 6 40-44 :22
## (Other):42
## inv_nodes node_caps deg_malig breast breast_quad
## 0-2 :213 ? : 8 Min. :1.000 left :152 ? : 1
## 12-14: 3 no :222 1st Qu.:2.000 right:134 central : 21
## 15-17: 6 yes: 56 Median :2.000 left_low :110
## 24-26: 1 Mean :2.049 left_up : 97
## 3-5 : 36 3rd Qu.:3.000 right_low: 24
## 6-8 : 17 Max. :3.000 right_up : 33
## 9-11 : 10
## irradiat
## no :218
## yes: 68
##
##
##
##
##
#Check for NA values in the data.
#Entries recorded as "?" are ordinary values, so they are not counted here.
sum(is.na(BreastCancer))
## [1] 0
Although no NA values are reported, the summary view shows that some entries in the data are recorded as "?".
#Replace the "?" placeholders (and any NA values) in the node_caps column.
# The column is handled as a plain character vector so the result is the same
# whether read.csv() returned factors (R < 4.0) or strings (R >= 4.0). Calling
# ifelse() on a factor would return its integer codes instead of its labels.
# NOTE(review): "?" is imputed as "yes" while an NA would be imputed as "no";
# confirm that this difference is intended.
node_caps <- as.character(BreastCancer$node_caps)
node_caps[is.na(node_caps)] <- "no"
node_caps[node_caps == "?"] <- "yes"
BreastCancer$node_caps <- node_caps
#Replace the "?" placeholders (and any NA values) in the breast_quad column
#with "left_low", the most frequent quadrant in the summary above.
breast_quad <- as.character(BreastCancer$breast_quad)
breast_quad[is.na(breast_quad) | breast_quad == "?"] <- "left_low"
BreastCancer$breast_quad <- breast_quad
#Check the levels of any column
factor (BreastCancer$irradiat)
## [1] no no no no no no no no no no no no no no no no no
## [18] no no no no no no no no no no no no no no no no no
## [35] no no no no no no no no no no no no no no no no no
## [52] no no no no no no no no no no no no no no no no no
## [69] no no no no no no no no no no no no no no no no no
## [86] no no no no no no no no no no no no no no no no no
## [103] no no no no no no no no no no no no no no no no no
## [120] no no no no no no no no yes no yes yes yes no no yes no
## [137] yes yes yes no no no no yes no yes yes yes no no no no no
## [154] yes no yes no no no no yes no yes yes yes no no yes no yes
## [171] yes no no no yes yes no no yes yes yes yes yes yes yes yes no
## [188] yes no no yes yes no no yes no no yes yes no no no no no
## [205] no no no no no no no no no no no no no no no no no
## [222] no yes no yes yes no yes no yes no no yes yes yes yes no no
## [239] no yes yes no yes no yes yes yes yes no no no yes no no no
## [256] no yes yes yes no no yes no no no no no yes no yes no yes
## [273] no yes yes no yes yes no yes yes no yes no no no
## Levels: no yes
#Labeling the features
# Every factor is built from the category names themselves, never from
# integer codes, so the mapping does not depend on how the file was parsed.
BreastCancer$output <- factor(
  BreastCancer$output,
  levels = c("no-recurrence-events", "recurrence-events"),
  labels = c(0, 1)
)
BreastCancer$age <- factor(
  BreastCancer$age,
  levels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"),
  labels = c(0, 1, 2, 3, 4, 5)
)
BreastCancer$menopause <- factor(
  BreastCancer$menopause,
  levels = c("premeno", "ge40", "lt40"),
  labels = c(0, 1, 2)
)
# Size bins are listed in numeric order ("5-9" directly after "0-4") so that a
# larger label always means a larger tumor.
BreastCancer$tumor_size <- factor(
  BreastCancer$tumor_size,
  levels = c(
    "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
    "30-34", "35-39", "40-44", "45-49", "50-54"
  ),
  labels = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
)
# Node-count bins are likewise listed in numeric order.
BreastCancer$inv_nodes <- factor(
  BreastCancer$inv_nodes,
  levels = c("0-2", "3-5", "6-8", "9-11", "12-14", "15-17", "24-26"),
  labels = c(0, 1, 2, 3, 4, 5, 6)
)
BreastCancer$node_caps <- factor(
  BreastCancer$node_caps,
  levels = c("no", "yes"),
  labels = c(0, 1)
)
BreastCancer$deg_malig <- factor(
  BreastCancer$deg_malig,
  levels = c("1", "2", "3"),
  labels = c(0, 1, 2)
)
BreastCancer$breast <- factor(
  BreastCancer$breast,
  levels = c("left", "right"),
  labels = c(0, 1)
)
BreastCancer$breast_quad <- factor(
  BreastCancer$breast_quad,
  levels = c("central", "left_low", "left_up", "right_low", "right_up"),
  labels = c(0, 1, 2, 3, 4)
)
# Coded no = 0 / yes = 1, the same convention used for node_caps.
BreastCancer$irradiat <- factor(
  BreastCancer$irradiat,
  levels = c("no", "yes"),
  labels = c(0, 1)
)
#Factorizing all features
# lapply() returns one factor per column. sapply() would simplify its result
# to a character matrix, which turns the columns back into plain strings.
FeaturesNames <- list("output", "age", "menopause", "tumor_size", "inv_nodes",
                      "node_caps", "deg_malig", "breast", "breast_quad",
                      "irradiat")
feature_cols <- unlist(FeaturesNames)
BreastCancer[feature_cols] <- lapply(BreastCancer[feature_cols], as.factor)
#Split the data to input-output
# y keeps only the first column (the outcome); x keeps every other column.
y <- BreastCancer %>% select(1)
x <- BreastCancer %>% select(-output)
#Split the data to train and test data for more analysis and modeling.
# sample.split() expects the vector of class labels, one per observation.
# Given a whole data frame it returns one flag per column, which subset()
# then recycles over the rows. Splitting on the outcome column gives one flag
# per row.
set.seed(123)  # fixed seed so the split is reproducible
sample <- sample.split(BreastCancer$output, SplitRatio = 0.75)
Train <- subset(BreastCancer, sample == TRUE)
Test <- subset(BreastCancer, sample == FALSE)
# Bar chart of the number of patients per age group, with each bar split by
# irradiation status.
ggplot(BreastCancer, aes(x = age, fill = irradiat)) +
  geom_bar()
The graph above shows that patients aged 20-29 were not exposed to irradiation and that patients aged 70-79 were exposed only a little, whereas most of the exposure to irradiation occurs among patients aged 30 to 69.
# Bar chart of the number of patients per age group, one panel per menopause
# category. Bars are filled by `irradiat` so that the figure shows the
# variable named in its title.
# NOTE(review): if the recurrence outcome was meant instead, set
# fill = output and retitle the plot accordingly.
ggplot(BreastCancer, aes(x = age, fill = irradiat)) +
  theme_bw() +
  facet_wrap(~menopause) +
  geom_bar() +
  labs(
    y = "Freq",
    title = "The Irradiat distribution by Age based on Menopause"
  )
The graph above shows the distribution of irradiat by age group, with one panel per menopause category.
Conclusion
Data cleaning and preprocessing are complete. Since this is a classification problem, a decision tree can help us visualize the decision rules for predicting a categorical outcome (a classification tree).