library(mlbench)
# Load the PimaIndiansDiabetes data set into the workspace and take a first
# look at its dimensions, column types and per-column distributions.
data(list = "PimaIndiansDiabetes")
dim(PimaIndiansDiabetes)
## [1] 768 9
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Summarise the eight numeric columns only; the ninth column (diabetes) is a
# factor. The recorded output shows a minimum of 0 for glucose, pressure,
# triceps, insulin and mass.
summary(PimaIndiansDiabetes[, seq_len(8)])
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
print("Saving original dataset into another variable to modify it")
## [1] "Saving original dataset into another variable to modify it"
# Work on a copy (PM) so PimaIndiansDiabetes itself is never modified.
PM <- PimaIndiansDiabetes
# Recode readings of 0 in glucose, pressure and triceps as NA (missing). Each
# summary() call below then reports how many values were recoded in its NA's
# column.
# NOTE(review): insulin and mass also show a minimum of 0 in the summary of
# the original data but are left unchanged here -- confirm that is intended.
PM$glucose <- replace(PM$glucose, PM$glucose == 0, NA)
print("Glucose data with 0 values removed")
## [1] "Glucose data with 0 values removed"
summary(PM$glucose)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.0 99.0 117.0 121.7 141.0 199.0 5
PM$pressure <- replace(PM$pressure, PM$pressure == 0, NA)
print("Pressure data with 0 values removed")
## [1] "Pressure data with 0 values removed"
summary(PM$pressure)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 24.00 64.00 72.00 72.41 80.00 122.00 35
PM$triceps <- replace(PM$triceps, PM$triceps == 0, NA)
print("Triceps data with 0 values removed")
## [1] "Triceps data with 0 values removed"
summary(PM$triceps)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 7.00 22.00 29.00 29.15 36.00 99.00 227
# Class balance of the outcome column: raw counts per factor level.
table(PM[["diabetes"]])
##
## neg pos
## 500 268
# The same balance expressed as proportions, rounded to two decimals.
round(prop.table(table(PM[["diabetes"]])), digits = 2)
##
## neg pos
## 0.65 0.35
# Repeat the same first-look workflow on the iris data set: dimensions,
# column types, numeric summaries and class balance.
print("Iris Data")
## [1] "Iris Data"
# Load iris into the workspace.
data("iris")
dim(iris)
## [1] 150 5
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Summarise the four numeric measurement columns; the fifth column (Species)
# is a factor and is tabulated separately below.
summary(iris[, 1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Class balance of Species: raw counts per level.
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
# The same balance as proportions, rounded to two decimals.
round(prop.table(table(iris$Species)), 2)
##
## setosa versicolor virginica
## 0.33 0.33 0.33