Source of the Data: https://archive.ics.uci.edu/ml/machine-learning-databases/00192/

Reading the Data

First of all, we are going to read the .csv file containing the dataset of electrical impedance measurements taken from samples of freshly excised breast tissue.

# Read "breast_tissue.csv" from the current working directory.
# stringsAsFactors = TRUE is set explicitly so that Class is read as a factor
# on every R version: it was the default before R 4.0.0, but since 4.0.0 the
# default is FALSE and Class would otherwise be read as character.
breast_tissue <- read.csv("breast_tissue.csv", stringsAsFactors = TRUE)

Since we are going to use the dplyr package, we load it first (it must already be installed, e.g. with install.packages("dplyr")).

# Load the dplyr package (this attaches it; it does not install it)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Dimensions and Structure of the Dataset

Now, we are going to get the dimensions and variables of the data with the functions dim() and str(). We have 106 instances and 10 variables.

# Number of rows and columns of the data frame
dim(breast_tissue)

## [1] 106  10

# Compact overview: type and first values of every column
str(breast_tissue)

## 'data.frame':    106 obs. of  10 variables:
##  $ Class : Factor w/ 6 levels "adi","car","con",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ I0    : num  525 330 552 380 363 ...
##  $ PA500 : num  0.187 0.227 0.232 0.241 0.201 ...
##  $ HFS   : num  0.0321 0.2653 0.0635 0.2862 0.2443 ...
##  $ DA    : num  229 121 265 138 125 ...
##  $ Area  : num  6844 3163 11888 5402 3290 ...
##  $ A.DA  : num  29.9 26.1 44.9 39.2 26.3 ...
##  $ Max.IP: num  60.2 69.7 77.8 88.8 69.4 ...
##  $ DR    : num  220.7 99.1 253.8 105.2 103.9 ...
##  $ P     : num  557 400 657 494 425 ...

Head and Tail

At this step, we will look at the head and the tail of the data set to see, by default, its first and last 6 rows.

# First rows of the data (head() shows 6 rows when n is not given)
head(breast_tissue)

##   Class       I0     PA500        HFS       DA      Area     A.DA   Max.IP
## 1   car 524.7941 0.1874484 0.03211406 228.8002  6843.598 29.91080 60.20488
## 2   car 330.0000 0.2268928 0.26529005 121.1542  3163.239 26.10920 69.71736
## 3   car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490 77.79330
## 4   car 380.0000 0.2408554 0.28623400 137.6401  5402.171 39.24852 88.75845
## 5   car 362.8313 0.2007129 0.24434610 124.9126  3290.462 26.34213 69.38939
## 6   car 389.8730 0.1500983 0.09773844 118.6258  2475.557 20.86862 49.75715
##          DR        P
## 1 220.73721 556.8283
## 2  99.08496 400.2258
## 3 253.78530 656.7694
## 4 105.19857 493.7018
## 5 103.86655 424.7965
## 6 107.68616 429.3858

# Last rows of the data (tail() shows 6 rows when n is not given)
tail(breast_tissue)

##     Class   I0      PA500         HFS        DA       Area      A.DA
## 101   adi 2000 0.06719518  0.12426744  330.2716  15381.098  46.57105
## 102   adi 2000 0.10698868  0.10541789  520.2226  40087.921  77.05916
## 103   adi 2600 0.20053833  0.20804325 1063.4414 174480.476 164.07154
## 104   adi 1600 0.07190757 -0.06632251  436.9436  12655.342  28.96333
## 105   adi 2300 0.04502950  0.13683381  185.4460   5086.292  27.42734
## 106   adi 2600 0.06998770  0.04886922  745.4744  39845.774  53.45023
##       Max.IP        DR        P
## 101 169.1980 283.63956 2063.073
## 102 204.0903 478.51722 2088.649
## 103 418.6873 977.55237 2664.584
## 104 103.7327 432.12975 1475.372
## 105 178.6917  49.59329 2480.592
## 106 154.1226 729.36839 2545.420

Removing NA Values

Now we are going to remove the rows where there are NA values.

# Flag the rows that have no missing value in any column.
good <- complete.cases(breast_tissue)
# Assign the filtered rows back to breast_tissue; without this assignment the
# NA rows are only hidden in the printed preview and every later step would
# still operate on the unfiltered data.
breast_tissue <- breast_tissue[good, ]
head(breast_tissue)

##   Class       I0     PA500        HFS       DA      Area     A.DA   Max.IP
## 1   car 524.7941 0.1874484 0.03211406 228.8002  6843.598 29.91080 60.20488
## 2   car 330.0000 0.2268928 0.26529005 121.1542  3163.239 26.10920 69.71736
## 3   car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490 77.79330
## 4   car 380.0000 0.2408554 0.28623400 137.6401  5402.171 39.24852 88.75845
## 5   car 362.8313 0.2007129 0.24434610 124.9126  3290.462 26.34213 69.38939
## 6   car 389.8730 0.1500983 0.09773844 118.6258  2475.557 20.86862 49.75715
##          DR        P
## 1 220.73721 556.8283
## 2  99.08496 400.2258
## 3 253.78530 656.7694
## 4 105.19857 493.7018
## 5 103.86655 424.7965
## 6 107.68616 429.3858

Select Function

By using the select function, we will select only the columns from Class through A.DA.

# Keep the contiguous range of columns from Class up to and including A.DA
subset_1 <- breast_tissue %>%
  select(Class:A.DA)
head(subset_1)

##   Class       I0     PA500        HFS       DA      Area     A.DA
## 1   car 524.7941 0.1874484 0.03211406 228.8002  6843.598 29.91080
## 2   car 330.0000 0.2268928 0.26529005 121.1542  3163.239 26.10920
## 3   car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490
## 4   car 380.0000 0.2408554 0.28623400 137.6401  5402.171 39.24852
## 5   car 362.8313 0.2007129 0.24434610 124.9126  3290.462 26.34213
## 6   car 389.8730 0.1500983 0.09773844 118.6258  2475.557 20.86862

Omit Columns

Using the select function with the - operator, we can omit the columns A.DA through P.

# Drop the contiguous range of columns from A.DA through P, keep the rest
subset_2 <- breast_tissue %>%
  select(-(A.DA:P))
head(subset_2)

##   Class       I0     PA500        HFS       DA      Area
## 1   car 524.7941 0.1874484 0.03211406 228.8002  6843.598
## 2   car 330.0000 0.2268928 0.26529005 121.1542  3163.239
## 3   car 551.8793 0.2324779 0.06352998 264.8049 11888.392
## 4   car 380.0000 0.2408554 0.28623400 137.6401  5402.171
## 5   car 362.8313 0.2007129 0.24434610 124.9126  3290.462
## 6   car 389.8730 0.1500983 0.09773844 118.6258  2475.557

select Using Patterns

By using the starts_with helper, we can choose only the variables whose names start with D.

# Keep only the columns whose names begin with "D"
subset_3 <- breast_tissue %>%
  select(starts_with("D"))
head(subset_3)

##         DA        DR
## 1 228.8002 220.73721
## 2 121.1542  99.08496
## 3 264.8049 253.78530
## 4 137.6401 105.19857
## 5 124.9126 103.86655
## 6 118.6258 107.68616

Filter Function

At this time, we are going to use the filter function to keep only the instances whose impedivity (ohm) at zero frequency (I0) is greater than 300.

# Keep only the rows where I0 is strictly greater than 300
breast_tissue.f <- breast_tissue %>%
  filter(I0 > 300)
# Preview the first I0 values of the filtered data
head(breast_tissue.f[["I0"]])

## [1] 524.7941 330.0000 551.8793 380.0000 362.8313 389.8730

# Minimum, quartiles, mean and maximum of I0 in the filtered data
summary(breast_tissue.f$I0)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   301.3   400.5   692.0  1116.4  1800.0  2800.0

Arrange Function

By using the arrange function, we are able to arrange the instances in ascending order by the maximum of the spectrum (Max.IP).

 breast_tissue <- breast_tissue %>% arrange(Max.IP)  # ascending order by Max.IP
# Show the 10 smallest Max.IP values (the first rows after sorting)
breast_tissue %>%
  select(Max.IP) %>%
  head(n = 10)

##       Max.IP
## 1   7.968783
## 2   9.102176
## 3   9.991348
## 4  10.675764
## 5  14.268374
## 6  17.506838
## 7  17.776981
## 8  17.868670
## 9  18.131014
## 10 18.226492

# Show the last 10 rows of the Max.IP column
breast_tissue %>%
  select(Max.IP) %>%
  tail(n = 10)

##       Max.IP
## 97  178.6917
## 98  204.0903
## 99  208.7400
## 100 217.8340
## 101 261.3482
## 102 289.5691
## 103 298.5830
## 104 336.0752
## 105 418.6873
## 106 436.0996

Rename Function

Now we are going to use the rename function to rename the variable Max.IP into maximumIP.

# Rename column Max.IP to maximumIP (new name on the left, old name on the right)
breast_tissue <- breast_tissue %>%
  rename(maximumIP = Max.IP)

Mutate

In this section, we use the mutate() function to compute transformations of variables in a data frame. We create an HFSdetrend variable by subtracting the mean of HFS from the HFS variable.

# Add HFSdetrend: HFS minus its mean, where the mean ignores missing values
breast_tissue <- breast_tissue %>%
  mutate(HFSdetrend = HFS - mean(HFS, na.rm = TRUE))
# Preview the first 5 values of the new column
breast_tissue %>%
  select(HFSdetrend) %>%
  head(n = 5)

##     HFSdetrend
## 1 -0.023933668
## 2 -0.100728155
## 3 -0.047670146
## 4  0.002595337
## 5 -0.020443009

Project 1_Data Cleaning

Honore Nguessan

6/15/2019