DATA 607 Assignment 1

Gehad Gad

February 2nd, 2020

Assignment 1

Introduction: This data set contains breast cancer findings obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. It includes 201 instances of one class and 85 instances of another class. The instances are described by 9 attributes, some of which are linear and some of which are nominal.

# Import libraries and/or packages.
# Attach dplyr; when it cannot be loaded, install it first and then attach it.
if (!require("dplyr", character.only = TRUE)) {
  install.packages("dplyr")
  library("dplyr", character.only = TRUE)
}

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.6.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Attach caTools (provides sample.split); install it first when it is missing.
if (!require("caTools", character.only = TRUE)) {
  install.packages("caTools")
  library("caTools", character.only = TRUE)
}

## Loading required package: caTools

## Warning: package 'caTools' was built under R version 3.6.2

# Attach ggplot2 for plotting; install it first when it is missing.
if (!require("ggplot2", character.only = TRUE)) {
  install.packages("ggplot2")
  library("ggplot2", character.only = TRUE)
}

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.6.2

Read the data into R. Name the features of the data.

# Download the UCI breast-cancer data set. The file has no header row, so the
# column names are supplied here.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() keeps text
# columns as character by default, while the cleaning and labelling steps
# later in this script rely on the columns being factors.
BreastCancer <- read.csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data",
  header = FALSE,
  col.names = c(
    "output", "age", "menopause", "tumor_size", "inv_nodes",
    "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"
  ),
  stringsAsFactors = TRUE
)

Display the first few rows of the data.

# Preview the first six rows of the data frame.
head(BreastCancer, n = 6L)

##                 output   age menopause tumor_size inv_nodes node_caps
## 1 no-recurrence-events 30-39   premeno      30-34       0-2        no
## 2 no-recurrence-events 40-49   premeno      20-24       0-2        no
## 3 no-recurrence-events 40-49   premeno      20-24       0-2        no
## 4 no-recurrence-events 60-69      ge40      15-19       0-2        no
## 5 no-recurrence-events 40-49   premeno        0-4       0-2        no
## 6 no-recurrence-events 60-69      ge40      15-19       0-2        no
##   deg_malig breast breast_quad irradiat
## 1         3   left    left_low       no
## 2         2  right    right_up       no
## 3         2   left    left_low       no
## 4         2  right     left_up       no
## 5         2  right   right_low       no
## 6         2   left    left_low       no

# Display a per-column summary of the data.

summary(object = BreastCancer)

##                   output       age       menopause     tumor_size
##  no-recurrence-events:201   20-29: 1   ge40   :129   30-34  :60  
##  recurrence-events   : 85   30-39:36   lt40   :  7   25-29  :54  
##                             40-49:90   premeno:150   20-24  :50  
##                             50-59:96                 15-19  :30  
##                             60-69:57                 10-14  :28  
##                             70-79: 6                 40-44  :22  
##                                                      (Other):42  
##  inv_nodes   node_caps   deg_malig       breast       breast_quad 
##  0-2  :213   ?  :  8   Min.   :1.000   left :152   ?        :  1  
##  12-14:  3   no :222   1st Qu.:2.000   right:134   central  : 21  
##  15-17:  6   yes: 56   Median :2.000               left_low :110  
##  24-26:  1             Mean   :2.049               left_up  : 97  
##  3-5  : 36             3rd Qu.:3.000               right_low: 24  
##  6-8  : 17             Max.   :3.000               right_up : 33  
##  9-11 : 10                                                        
##  irradiat 
##  no :218  
##  yes: 68  
##           
##           
##           
##           
##

# Count the NA values across the whole data frame.

sum(is.na(x = BreastCancer))

## [1] 0

Although no NA values are reported, some `?` entries appear in the data, as shown in the summary view.

# Replace the "?" placeholder in the node_caps column with "yes".

BreastCancer$node_caps <- replace(
  BreastCancer$node_caps,
  BreastCancer$node_caps == "?",
  "yes"
)

# Replace any NA values in node_caps with "no".
# NOTE: when node_caps is a factor, ifelse() drops the factor attributes and
# returns the underlying integer level codes.

BreastCancer$node_caps <- ifelse(
  test = is.na(BreastCancer$node_caps),
  yes = ave(BreastCancer$node_caps, FUN = function(values) "no"),
  no = BreastCancer$node_caps
)

# Replace the "?" placeholder in the breast_quad column with "left_low".

BreastCancer$breast_quad <- replace(
  BreastCancer$breast_quad,
  BreastCancer$breast_quad == "?",
  "left_low"
)

# Replace any NA values in breast_quad with "left_low".
# NOTE: as above, a factor column comes out of ifelse() as integer codes.

BreastCancer$breast_quad <- ifelse(
  test = is.na(BreastCancer$breast_quad),
  yes = ave(BreastCancer$breast_quad, FUN = function(values) "left_low"),
  no = BreastCancer$breast_quad
)

# Check the values and levels of a column (here: irradiat).

factor(x = BreastCancer$irradiat)

##   [1] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [18] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [35] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [52] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [69] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
##  [86] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
## [103] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
## [120] no  no  no  no  no  no  no  no  yes no  yes yes yes no  no  yes no 
## [137] yes yes yes no  no  no  no  yes no  yes yes yes no  no  no  no  no 
## [154] yes no  yes no  no  no  no  yes no  yes yes yes no  no  yes no  yes
## [171] yes no  no  no  yes yes no  no  yes yes yes yes yes yes yes yes no 
## [188] yes no  no  yes yes no  no  yes no  no  yes yes no  no  no  no  no 
## [205] no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no  no 
## [222] no  yes no  yes yes no  yes no  yes no  no  yes yes yes yes no  no 
## [239] no  yes yes no  yes no  yes yes yes yes no  no  no  yes no  no  no 
## [256] no  yes yes yes no  no  yes no  no  no  no  no  yes no  yes no  yes
## [273] no  yes yes no  yes yes no  yes yes no  yes no  no  no 
## Levels: no yes

# Labelling the features: recode every column as a factor whose labels are
# numeric codes ("0", "1", ...).
# For the binned columns the levels are listed in numeric bin order, so a
# larger code always means a larger bin.

BreastCancer$output <- factor(
  BreastCancer$output,
  levels = c("no-recurrence-events", "recurrence-events"),
  labels = c(0, 1)
)

BreastCancer$age <- factor(
  BreastCancer$age,
  levels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"),
  labels = c(0, 1, 2, 3, 4, 5)
)

BreastCancer$menopause <- factor(
  BreastCancer$menopause,
  levels = c("premeno", "ge40", "lt40"),
  labels = c(0, 1, 2)
)

# tumor_size: "5-9" is placed directly after "0-4" (numeric order) instead of
# between "45-49" and "50-54" (alphabetical order).
BreastCancer$tumor_size <- factor(
  BreastCancer$tumor_size,
  levels = c(
    "0-4", "5-9", "10-14", "15-19", "20-24", "25-29",
    "30-34", "35-39", "40-44", "45-49", "50-54"
  ),
  labels = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
)

# inv_nodes: bins listed in numeric order ("3-5" before "12-14").
BreastCancer$inv_nodes <- factor(
  BreastCancer$inv_nodes,
  levels = c("0-2", "3-5", "6-8", "9-11", "12-14", "15-17", "24-26"),
  labels = c(0, 1, 2, 3, 4, 5, 6)
)

# node_caps reaches this point as the integer codes left by the earlier
# ifelse() clean-up, hence the numeric level names.
# NOTE(review): presumes the column was read as a factor with levels
# "?", "no", "yes", so that 2 = "no" and 3 = "yes" -- confirm.
BreastCancer$node_caps <- factor(
  BreastCancer$node_caps,
  levels = c("2", "3"),
  labels = c(0, 1)
)

BreastCancer$deg_malig <- factor(
  BreastCancer$deg_malig,
  levels = c("1", "2", "3"),
  labels = c(0, 1, 2)
)

BreastCancer$breast <- factor(
  BreastCancer$breast,
  levels = c("left", "right"),
  labels = c(0, 1)
)

# breast_quad is also held as integer codes after the earlier ifelse() step.
# NOTE(review): presumes factor levels "?", "central", "left_low", "left_up",
# "right_low", "right_up", so that codes 2..6 map to central..right_up --
# confirm.
BreastCancer$breast_quad <- factor(
  BreastCancer$breast_quad,
  levels = c("2", "3", "4", "5", "6"),
  labels = c(0, 1, 2, 3, 4)
)

# irradiat: "no" -> 0 and "yes" -> 1, the same no/yes coding as node_caps.
BreastCancer$irradiat <- factor(
  BreastCancer$irradiat,
  levels = c("no", "yes"),
  labels = c(0, 1)
)

# Factorizing all features.

FeaturesNames <- list(
  "output", "age", "menopause", "tumor_size", "inv_nodes",
  "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"
)

# Convert each listed column to a factor in place.
# [[ ]] handles the column as a plain vector. Calling sapply() on a one-column
# data frame instead simplifies the result to a character matrix, which would
# store the column as character rather than as a factor.
for (feature in FeaturesNames) {
  BreastCancer[[feature]] <- as.factor(BreastCancer[[feature]])
}

# Split the data into the output (first column) and the input features.

y <- BreastCancer %>% select(1)

x <- BreastCancer %>% select(-output)

# Split the data into train and test sets for further analysis and modelling.
# sample.split() expects the vector of outcome labels and returns one
# TRUE/FALSE flag per element. Passing the whole data frame would yield only
# one flag per column, which subset() would then recycle across the rows.
# Call set.seed() beforehand if a reproducible split is required.
sample <- sample.split(BreastCancer$output, SplitRatio = 0.75)
Train <- subset(BreastCancer, sample)
Test <- subset(BreastCancer, !sample)

# Bar chart of record counts per age group, filled by irradiat.
ggplot(data = BreastCancer, mapping = aes(x = age, fill = irradiat)) +
  geom_bar()

The graph above shows that patients aged 20-29 were not exposed to irradiation and patients aged 70-79 were exposed only a little, while the majority of the irradiation exposure occurs among patients aged 30 to 69.

# Bar chart of record counts per age group, filled by the recurrence outcome
# (output), with one panel per menopause category.
# The title describes the plotted fill variable (output).
# NOTE(review): if irradiation status was meant to be shown here, switch the
# fill aesthetic to irradiat and retitle accordingly.
ggplot(BreastCancer, aes(x = age, fill = output)) +
  theme_bw() +
  facet_wrap(~menopause) +
  geom_bar() +
  labs(
    y = "Freq",
    title = "The Recurrence (output) distribution by Age based on Menopause"
  )

The graph above shows the distribution of the recurrence outcome (`output`) across age groups, with one panel per menopause category.

Conclusion

Data cleaning and preprocessing are done. Since this is a classification problem, a decision tree can help us visualize the decision rules for predicting a categorical outcome (classification tree).