Datasets Processing

# Read a local copy of the iris data. The file has no header row, so the
# columns get the default names V1..V5.
filename <- "C:/Users/18324/Documents/AI4OPT/R stats/iris.csv"
dataset <- read.csv(file = filename, header = FALSE)
# Preview the first rows (head() shows 6 rows by default)
head(x = dataset)

##    V1  V2  V3  V4          V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa

library(RCurl)
# Copy of the iris data hosted by the UCI Machine Learning Repository
# (comma-separated, no header row).
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Download the file as a single character string. Certificate verification is
# kept ON: with it disabled, anything able to intercept the connection could
# substitute the data. If verification fails because no CA bundle is found,
# pass a bundle via `cainfo =` instead of turning verification off.
downloaded <- getURL(url, ssl.verifypeer = TRUE)
# Parse the downloaded text as CSV through an in-memory connection.
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header = FALSE)
# textConnection() returns an already-open connection, which read.csv() does
# not close for us; release it explicitly.
close(connection)
# Preview the first rows (head() shows 6 rows by default)
head(dataset)

##    V1  V2  V3  V4          V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa

# Univariate plots of the four numeric iris attributes. Each plot type gets
# its own row of four side-by-side panels, one panel per attribute.
data(iris)

# Histograms
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  hist(iris[, i], main = colnames(iris)[i])
}

# Density estimates
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = colnames(iris)[i])
}

# Box-and-whisker plots
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(iris[, i], main = colnames(iris)[i])
}

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

# Draw a missingness map of the Soybean data set (from mlbench) without a
# legend, using black and grey as the two cell colors.
library(mlbench)
data("Soybean")
missmap(Soybean, legend = FALSE, col = c("black", "grey"))
# MULTIVARIATE VISUALIZATION
library(corrplot)

## corrplot 0.92 loaded

# Pairwise correlations between the four numeric iris attributes.
data("iris")
correlations <- cor(iris[, 1:4])
# Plot the correlation matrix. Previously `correlations` was computed but
# never displayed, and the corrplot package loaded above went unused.
corrplot(correlations, method = "circle")

# Scatterplot matrix of every pair of columns.
pairs(iris)

# The same scatterplot matrix with points colored by species.
pairs(Species ~ ., data = iris, col = iris$Species)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Per-class plots of the iris attributes with caret::featurePlot():
# x holds the four numeric attributes, y the Species factor.
x <- iris[, -5]
y <- iris[["Species"]]
# Let every panel choose its own x and y axis limits.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
featurePlot(x = x, y = y, plot = "density", scales = scales)

featurePlot(x = x, y = y, plot = "box")

# Section: Peek at your data
# Show the first 20 rows of the Pima Indians diabetes data set.
library(mlbench)
data("PimaIndiansDiabetes")
head(x = PimaIndiansDiabetes, n = 20)

##    pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1         6     148       72      35       0 33.6    0.627  50      pos
## 2         1      85       66      29       0 26.6    0.351  31      neg
## 3         8     183       64       0       0 23.3    0.672  32      pos
## 4         1      89       66      23      94 28.1    0.167  21      neg
## 5         0     137       40      35     168 43.1    2.288  33      pos
## 6         5     116       74       0       0 25.6    0.201  30      neg
## 7         3      78       50      32      88 31.0    0.248  26      pos
## 8        10     115        0       0       0 35.3    0.134  29      neg
## 9         2     197       70      45     543 30.5    0.158  53      pos
## 10        8     125       96       0       0  0.0    0.232  54      pos
## 11        4     110       92       0       0 37.6    0.191  30      neg
## 12       10     168       74       0       0 38.0    0.537  34      pos
## 13       10     139       80       0       0 27.1    1.441  57      neg
## 14        1     189       60      23     846 30.1    0.398  59      pos
## 15        5     166       72      19     175 25.8    0.587  51      pos
## 16        7     100        0       0       0 30.0    0.484  32      pos
## 17        0     118       84      47     230 45.8    0.551  31      pos
## 18        7     107       74       0       0 29.6    0.254  31      pos
## 19        1     103       30      38      83 43.3    0.183  33      neg
## 20        1     115       70      30      96 34.6    0.529  32      pos

# Number of rows and columns in the data set.
dim(x = PimaIndiansDiabetes)

## [1] 768   9

# Class of every attribute in the Boston housing data.
data("BostonHousing")
sapply(X = BostonHousing, FUN = class)

##      crim        zn     indus      chas       nox        rm       age       dis 
## "numeric" "numeric" "numeric"  "factor" "numeric" "numeric" "numeric" "numeric" 
##       rad       tax   ptratio         b     lstat      medv 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"

# Distribution of the class variable: count and percentage of each diabetes
# outcome.
y <- PimaIndiansDiabetes$diabetes
# Multiply by 100 AFTER prop.table(). Scaling the counts before normalising
# cancels out and leaves plain proportions in the "percentage" column.
cbind(freq = table(y), percentage = prop.table(table(y)) * 100)

##     freq percentage
## neg  500   65.10417
## pos  268   34.89583

# Five-number summary plus mean of the glucose attribute.
summary(PimaIndiansDiabetes$glucose)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   117.0   120.9   140.2   199.0

# Chapter 9: pre-processing with caret
library(caret)
data("iris")
print("Scale Data")

## [1] "Scale Data"

# Raw attribute summary, for comparison with the transformed values below.
summary(iris[, 1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Estimate the "scale" parameters from the four numeric attributes.
preProcessParams <- preProcess(iris[, 1:4], method = "scale")
print(preProcessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (4)

# Apply the estimated transform to the same attributes.
transformed <- predict(preProcessParams, iris[, 1:4])
summary(transformed)

##   Sepal.Length    Sepal.Width      Petal.Length     Petal.Width    
##  Min.   :5.193   Min.   : 4.589   Min.   :0.5665   Min.   :0.1312  
##  1st Qu.:6.159   1st Qu.: 6.424   1st Qu.:0.9064   1st Qu.:0.3936  
##  Median :7.004   Median : 6.883   Median :2.4642   Median :1.7055  
##  Mean   :7.057   Mean   : 7.014   Mean   :2.1288   Mean   :1.5734  
##  3rd Qu.:7.729   3rd Qu.: 7.571   3rd Qu.:2.8890   3rd Qu.:2.3615  
##  Max.   :9.540   Max.   :10.095   Max.   :3.9087   Max.   :3.2798

# Check one attribute by hand: the minimum of Sepal.Length divided by its
# standard deviation matches the scaled minimum reported above.
ISd <- sd(iris[["Sepal.Length"]])
ISd

## [1] 0.8280661

min(iris[["Sepal.Length"]]) / ISd

## [1] 5.192822

print("Center Data")

## [1] "Center Data"

# Raw attribute summary, for comparison with the centered values below.
summary(iris[, 1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Estimate the "center" parameters from the four numeric attributes.
preProcessParams <- preProcess(iris[, 1:4], method = "center")
print(preProcessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)

# Apply the transform; every attribute ends up with mean 0 (see summary).
transformed <- predict(preProcessParams, iris[, 1:4])
summary(transformed)

##   Sepal.Length       Sepal.Width        Petal.Length     Petal.Width     
##  Min.   :-1.54333   Min.   :-1.05733   Min.   :-2.758   Min.   :-1.0993  
##  1st Qu.:-0.74333   1st Qu.:-0.25733   1st Qu.:-2.158   1st Qu.:-0.8993  
##  Median :-0.04333   Median :-0.05733   Median : 0.592   Median : 0.1007  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.: 0.55667   3rd Qu.: 0.24267   3rd Qu.: 1.342   3rd Qu.: 0.6007  
##  Max.   : 2.05667   Max.   : 1.34267   Max.   : 3.142   Max.   : 1.3007

print("Standardize data")

## [1] "Standardize data"

# Raw attribute summary, for comparison with the standardized values below.
summary(iris[, 1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Combine centering and scaling in a single pre-processing step.
preProcessParams <- preProcess(
  iris[, 1:4],
  method = c("center", "scale")
)
print(preProcessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - scaled (4)

# Apply the combined transform to the same attributes.
transformed <- predict(preProcessParams, iris[, 1:4])
summary(transformed)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

print("Normalize Data")

## [1] "Normalize Data"

# Estimate the "range" parameters, which rescale each attribute to [0, 1]
# (see the printed description below).
preProcessParams <- preProcess(iris[, 1:4], method = "range")
print(preProcessParams)

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (4)

# Apply the rescaling to the same attributes.
transformed <- predict(preProcessParams, iris[, 1:4])
summary(transformed)

##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.2222   1st Qu.:0.3333   1st Qu.:0.1017   1st Qu.:0.08333  
##  Median :0.4167   Median :0.4167   Median :0.5678   Median :0.50000  
##  Mean   :0.4287   Mean   :0.4406   Mean   :0.4675   Mean   :0.45806  
##  3rd Qu.:0.5833   3rd Qu.:0.5417   3rd Qu.:0.6949   3rd Qu.:0.70833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000

print("Box-Cox Transform")

## [1] "Box-Cox Transform"

library(mlbench)
library(caret)
data("PimaIndiansDiabetes")
# Summary of the two attributes to be transformed (columns 7 and 8:
# pedigree and age).
summary(PimaIndiansDiabetes[, 7:8])

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# Estimate a Box-Cox lambda for each of the two attributes.
preProcessParams <- preProcess(
  PimaIndiansDiabetes[, 7:8],
  method = "BoxCox"
)
print(preProcessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - Box-Cox transformation (2)
##   - ignored (0)
## 
## Lambda estimates for Box-Cox transformation:
## -0.1, -1.1

# Apply the estimated transform to the same two attributes.
transformed <- predict(preProcessParams, PimaIndiansDiabetes[, 7:8])
summary(transformed)

##     pedigree            age        
##  Min.   :-2.5510   Min.   :0.8772  
##  1st Qu.:-1.4116   1st Qu.:0.8815  
##  Median :-0.9875   Median :0.8867  
##  Mean   :-0.9599   Mean   :0.8874  
##  3rd Qu.:-0.4680   3rd Qu.:0.8938  
##  Max.   : 0.8838   Max.   :0.9019

print("Yeo Johnson Transform")

## [1] "Yeo Johnson Transform"

data("PimaIndiansDiabetes")
# Summary of the two attributes to be transformed (columns 7 and 8:
# pedigree and age).
summary(PimaIndiansDiabetes[, 7:8])

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# Estimate a Yeo-Johnson lambda for each of the two attributes.
preProcessParams <- preProcess(
  PimaIndiansDiabetes[, 7:8],
  method = "YeoJohnson"
)
print(preProcessParams)

## Created from 768 samples and 2 variables
## 
## Pre-processing:
##   - ignored (0)
##   - Yeo-Johnson transformation (2)
## 
## Lambda estimates for Yeo-Johnson transformation:
## -2.25, -1.15

# Apply the estimated transform to the same two attributes.
transformed <- predict(preProcessParams, PimaIndiansDiabetes[, 7:8])
summary(transformed)

##     pedigree           age        
##  Min.   :0.0691   Min.   :0.8450  
##  1st Qu.:0.1724   1st Qu.:0.8484  
##  Median :0.2265   Median :0.8524  
##  Mean   :0.2317   Mean   :0.8530  
##  3rd Qu.:0.2956   3rd Qu.:0.8580  
##  Max.   :0.4164   Max.   :0.8644

print("Principal Component Analysis Transform")

## [1] "Principal Component Analysis Transform"

data(iris)
# Summary of the full data set, including the Species factor.
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Center, scale and then extract principal components. The whole data frame
# is passed in; the printed description reports one variable as ignored.
preProcessParams <- preProcess(
  iris,
  method = c("center", "scale", "pca")
)
print(preProcessParams)

## Created from 150 samples and 5 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (1)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance

# Apply the transform; the result keeps Species and adds PC1 and PC2.
transformed <- predict(preProcessParams, iris)
summary(transformed)

##        Species        PC1               PC2          
##  setosa    :50   Min.   :-2.7651   Min.   :-2.67732  
##  versicolor:50   1st Qu.:-2.0957   1st Qu.:-0.59205  
##  virginica :50   Median : 0.4169   Median :-0.01744  
##                  Mean   : 0.0000   Mean   : 0.00000  
##                  3rd Qu.: 1.3385   3rd Qu.: 0.59649  
##                  Max.   : 3.2996   Max.   : 2.64521

print("Independent Component Analysis Transform")

## [1] "Independent Component Analysis Transform"

library(fastICA)
data("PimaIndiansDiabetes")
# NOTE(review): only columns 7:8 are summarized here, although all eight
# input attributes (columns 1:8) are transformed below.
summary(PimaIndiansDiabetes[, 7:8])

##     pedigree           age       
##  Min.   :0.0780   Min.   :21.00  
##  1st Qu.:0.2437   1st Qu.:24.00  
##  Median :0.3725   Median :29.00  
##  Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :2.4200   Max.   :81.00

# Center, scale and then extract 5 independent components from the eight
# input attributes.
# NOTE(review): no seed is set before this call; if the ICA fit starts from a
# random initialisation the values below will differ between runs - confirm.
preProcessParams <- preProcess(
  PimaIndiansDiabetes[, 1:8],
  method = c("center", "scale", "ica"),
  n.comp = 5
)
print(preProcessParams)

## Created from 768 samples and 8 variables
## 
## Pre-processing:
##   - centered (8)
##   - independent component signal extraction (8)
##   - ignored (0)
##   - scaled (8)
## 
## ICA used 5 components

# Apply the transform to the same eight attributes.
transformed <- predict(preProcessParams, PimaIndiansDiabetes[, 1:8])
summary(transformed)

##       ICA1               ICA2               ICA3              ICA4        
##  Min.   :-2.38087   Min.   :-4.89623   Min.   :-6.0212   Min.   :-1.4141  
##  1st Qu.:-0.73737   1st Qu.:-0.48395   1st Qu.:-0.4290   1st Qu.:-0.8414  
##  Median : 0.07183   Median : 0.02404   Median : 0.2595   Median :-0.2787  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.72242   3rd Qu.: 0.59486   3rd Qu.: 0.6822   3rd Qu.: 0.7691  
##  Max.   : 2.93767   Max.   : 4.17244   Max.   : 1.5749   Max.   : 3.0683  
##       ICA5        
##  Min.   :-5.5400  
##  1st Qu.:-0.4688  
##  Median : 0.1392  
##  Mean   : 0.0000  
##  3rd Qu.: 0.6491  
##  Max.   : 3.2184

Datasets Processing

LG

2022-06-28