library(mlbench)
# Load the Pima Indians Diabetes data (provided by the mlbench package) and
# take a first look at its size, column types and summary statistics.
# Lines beginning with `##` appear to be the recorded console output of the
# call directly above them.
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768   9
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Summarise every column except the last one (the `diabetes` outcome factor),
# i.e. the eight numeric predictors.
summary(PimaIndiansDiabetes[, -ncol(PimaIndiansDiabetes)])
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00
print("Saving original dataset into another variable to modify it")
## [1] "Saving original dataset into another variable to modify it"
# Work on a copy so the PimaIndiansDiabetes object itself stays untouched.
PM <- PimaIndiansDiabetes
# Recode zeros in glucose, pressure and triceps as NA, one column at a time,
# and print a summary after each step so the resulting NA count is visible.
# NOTE(review): the summary() output recorded earlier in this script shows a
# minimum of 0 for insulin and mass as well; those columns are not recoded
# here -- confirm that is intended.
PM$glucose <- replace(PM$glucose, PM$glucose == 0, NA)
print("Glucose data with 0 values removed")
## [1] "Glucose data with 0 values removed"
summary(PM$glucose)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    44.0    99.0   117.0   121.7   141.0   199.0       5
PM$pressure <- replace(PM$pressure, PM$pressure == 0, NA)
print("Pressure data with 0 values removed")
## [1] "Pressure data with 0 values removed"
summary(PM$pressure)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   24.00   64.00   72.00   72.41   80.00  122.00      35
PM$triceps <- replace(PM$triceps, PM$triceps == 0, NA)
print("Triceps data with 0 values removed")
## [1] "Triceps data with 0 values removed"
summary(PM$triceps)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    7.00   22.00   29.00   29.15   36.00   99.00     227
# Class balance of the outcome factor: absolute counts first, then each
# class's share of all rows rounded to two decimals.
table(PM[["diabetes"]])
## 
## neg pos 
## 500 268
round(prop.table(table(PM[["diabetes"]])), digits = 2)
## 
##  neg  pos 
## 0.65 0.35
# First look at the iris data: dimensions, column types, summary statistics
# of the four numeric measurements, and the class balance of Species.
# Lines beginning with `##` appear to be the recorded console output of the
# call directly above them.
print("Iris Data")
## [1] "Iris Data"
# One data() call is sufficient to load the dataset into the workspace.
data(iris)
dim(iris)
## [1] 150   5
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Columns 1-4 are the numeric measurements; column 5 (Species) is the factor.
summary(iris[, 1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# Absolute class counts, then class shares rounded to two decimals.
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
round(prop.table(table(iris$Species)), 2)
## 
##     setosa versicolor  virginica 
##       0.33       0.33       0.33