Source of the Data: https://archive.ics.uci.edu/ml/machine-learning-databases/00192/
First of all, we are going to read the .csv file of the dataset, which contains electrical impedance measurements of samples of freshly excised breast tissue.
# Read the breast-tissue impedance measurements from "breast_tissue.csv".
# stringsAsFactors = TRUE is stated explicitly so that Class is a factor on
# every R version; from R 4.0.0 on, read.csv() would otherwise read it as
# character.
breast_tissue <- read.csv("breast_tissue.csv", stringsAsFactors = TRUE)
Since we are going to use the dplyr package, we will load it.
#Loading the dplyr package
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Now, we are going to get the dimensions and variables of the data with the functions dim() and str(). We have 106 instances and 10 variables.
# Number of rows and columns of the data frame
dim(breast_tissue)
## [1] 106 10
# Compact structure of the data frame: each column's type and first values
str(breast_tissue)
## 'data.frame': 106 obs. of 10 variables:
## $ Class : Factor w/ 6 levels "adi","car","con",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ I0 : num 525 330 552 380 363 ...
## $ PA500 : num 0.187 0.227 0.232 0.241 0.201 ...
## $ HFS : num 0.0321 0.2653 0.0635 0.2862 0.2443 ...
## $ DA : num 229 121 265 138 125 ...
## $ Area : num 6844 3163 11888 5402 3290 ...
## $ A.DA : num 29.9 26.1 44.9 39.2 26.3 ...
## $ Max.IP: num 60.2 69.7 77.8 88.8 69.4 ...
## $ DR : num 220.7 99.1 253.8 105.2 103.9 ...
## $ P : num 557 400 657 494 425 ...
At this step, we will look at the head and the tail of the data set to see, by default, its first and last 6 rows.
# Preview the first rows of the data (head() shows 6 rows by default)
head(breast_tissue)
## Class I0 PA500 HFS DA Area A.DA Max.IP
## 1 car 524.7941 0.1874484 0.03211406 228.8002 6843.598 29.91080 60.20488
## 2 car 330.0000 0.2268928 0.26529005 121.1542 3163.239 26.10920 69.71736
## 3 car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490 77.79330
## 4 car 380.0000 0.2408554 0.28623400 137.6401 5402.171 39.24852 88.75845
## 5 car 362.8313 0.2007129 0.24434610 124.9126 3290.462 26.34213 69.38939
## 6 car 389.8730 0.1500983 0.09773844 118.6258 2475.557 20.86862 49.75715
## DR P
## 1 220.73721 556.8283
## 2 99.08496 400.2258
## 3 253.78530 656.7694
## 4 105.19857 493.7018
## 5 103.86655 424.7965
## 6 107.68616 429.3858
# Preview the last rows of the data (tail() shows 6 rows by default)
tail(breast_tissue)
## Class I0 PA500 HFS DA Area A.DA
## 101 adi 2000 0.06719518 0.12426744 330.2716 15381.098 46.57105
## 102 adi 2000 0.10698868 0.10541789 520.2226 40087.921 77.05916
## 103 adi 2600 0.20053833 0.20804325 1063.4414 174480.476 164.07154
## 104 adi 1600 0.07190757 -0.06632251 436.9436 12655.342 28.96333
## 105 adi 2300 0.04502950 0.13683381 185.4460 5086.292 27.42734
## 106 adi 2600 0.06998770 0.04886922 745.4744 39845.774 53.45023
## Max.IP DR P
## 101 169.1980 283.63956 2063.073
## 102 204.0903 478.51722 2088.649
## 103 418.6873 977.55237 2664.584
## 104 103.7327 432.12975 1475.372
## 105 178.6917 49.59329 2480.592
## 106 154.1226 729.36839 2545.420
Now we are going to remove the rows where there are NA values.
# Flag the rows that have no missing value in any column ...
good <- complete.cases(breast_tissue)
# ... and actually drop the incomplete rows, so that every later step works
# on the cleaned data rather than only previewing it.
breast_tissue <- breast_tissue[good, ]
head(breast_tissue)
## Class I0 PA500 HFS DA Area A.DA Max.IP
## 1 car 524.7941 0.1874484 0.03211406 228.8002 6843.598 29.91080 60.20488
## 2 car 330.0000 0.2268928 0.26529005 121.1542 3163.239 26.10920 69.71736
## 3 car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490 77.79330
## 4 car 380.0000 0.2408554 0.28623400 137.6401 5402.171 39.24852 88.75845
## 5 car 362.8313 0.2007129 0.24434610 124.9126 3290.462 26.34213 69.38939
## 6 car 389.8730 0.1500983 0.09773844 118.6258 2475.557 20.86862 49.75715
## DR P
## 1 220.73721 556.8283
## 2 99.08496 400.2258
## 3 253.78530 656.7694
## 4 105.19857 493.7018
## 5 103.86655 424.7965
## 6 107.68616 429.3858
By using the select function, we will select only the columns from Class all the way to A.DA.
# Keep the contiguous range of columns from Class through A.DA,
# then preview the first rows of the result.
subset_1 <- breast_tissue %>%
  select(Class:A.DA)
subset_1 %>%
  head()
## Class I0 PA500 HFS DA Area A.DA
## 1 car 524.7941 0.1874484 0.03211406 228.8002 6843.598 29.91080
## 2 car 330.0000 0.2268928 0.26529005 121.1542 3163.239 26.10920
## 3 car 551.8793 0.2324779 0.06352998 264.8049 11888.392 44.89490
## 4 car 380.0000 0.2408554 0.28623400 137.6401 5402.171 39.24852
## 5 car 362.8313 0.2007129 0.24434610 124.9126 3290.462 26.34213
## 6 car 389.8730 0.1500983 0.09773844 118.6258 2475.557 20.86862
Using the select function with the - operator, we can omit the columns from A.DA through P.
# Drop the contiguous range of columns from A.DA through P,
# then preview the first rows of what remains.
subset_2 <- breast_tissue %>%
  select(-(A.DA:P))
subset_2 %>%
  head()
## Class I0 PA500 HFS DA Area
## 1 car 524.7941 0.1874484 0.03211406 228.8002 6843.598
## 2 car 330.0000 0.2268928 0.26529005 121.1542 3163.239
## 3 car 551.8793 0.2324779 0.06352998 264.8049 11888.392
## 4 car 380.0000 0.2408554 0.28623400 137.6401 5402.171
## 5 car 362.8313 0.2007129 0.24434610 124.9126 3290.462
## 6 car 389.8730 0.1500983 0.09773844 118.6258 2475.557
By using the starts_with() helper, we can choose only the variables whose names start with D.
# Keep only the columns whose names begin with "D",
# then preview the first rows of the result.
subset_3 <- breast_tissue %>%
  select(starts_with("D"))
subset_3 %>%
  head()
## DA DR
## 1 228.8002 220.73721
## 2 121.1542 99.08496
## 3 264.8049 253.78530
## 4 137.6401 105.19857
## 5 124.9126 103.86655
## 6 118.6258 107.68616
At this time, we are going to use the filter function to keep only the instances whose impedivity (ohm) at zero frequency (I0) is greater than 300.
# Keep only the rows whose I0 value exceeds 300,
# then show the first I0 values of the filtered data.
breast_tissue.f <- breast_tissue %>%
  filter(I0 > 300)
head(breast_tissue.f[["I0"]])
## [1] 524.7941 330.0000 551.8793 380.0000 362.8313 389.8730
# Minimum, quartiles, median, mean and maximum of I0 in the filtered rows
summary(breast_tissue.f$I0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 301.3 400.5 692.0 1116.4 1800.0 2800.0
By using the arrange function, we can sort the instances in ascending order of Max.IP, the maximum of the spectrum.
# Sort the rows by Max.IP in ascending order (the sorted data replaces
# breast_tissue), then show the 10 smallest Max.IP values.
breast_tissue <- breast_tissue %>%
  arrange(Max.IP)
breast_tissue %>%
  select(Max.IP) %>%
  head(10)
## Max.IP
## 1 7.968783
## 2 9.102176
## 3 9.991348
## 4 10.675764
## 5 14.268374
## 6 17.506838
## 7 17.776981
## 8 17.868670
## 9 18.131014
## 10 18.226492
# Show the last 10 rows of the Max.IP column
breast_tissue %>%
  select(Max.IP) %>%
  tail(10)
## Max.IP
## 97 178.6917
## 98 204.0903
## 99 208.7400
## 100 217.8340
## 101 261.3482
## 102 289.5691
## 103 298.5830
## 104 336.0752
## 105 418.6873
## 106 436.0996
Now we are going to use the rename function to rename the variable Max.IP to maximumIP.
# Rename the column Max.IP to maximumIP (new name on the left-hand side)
breast_tissue <- breast_tissue %>%
  rename(maximumIP = Max.IP)
In this section, we use the mutate() function to compute transformations of variables in a data frame. We create the HFSdetrend variable by subtracting the mean of HFS from the HFS variable.
# Add HFSdetrend: HFS centred on its mean (missing values are ignored when
# computing the mean), then show the first 5 values of the new column.
breast_tissue <- breast_tissue %>%
  mutate(HFSdetrend = HFS - mean(HFS, na.rm = TRUE))
breast_tissue %>%
  select(HFSdetrend) %>%
  head(5)
## HFSdetrend
## 1 -0.023933668
## 2 -0.100728155
## 3 -0.047670146
## 4 0.002595337
## 5 -0.020443009