Datasets Processing

# Load the iris measurements from a local CSV export. header = FALSE tells
# read.csv() there is no header row, so the columns get the default names
# V1..V5.
filename <- "C:/Users/18324/Documents/AI4OPT/R stats/iris.csv"
dataset <- read.csv(file = filename, header = FALSE)
# Show the top of the table; head() returns the first six rows by default.
head(x = dataset)

##    V1  V2  V3  V4          V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa

library(RCurl)
# Fetch the iris data set straight from the UCI Machine Learning Repository.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Download the file as a single character string. Certificate verification is
# kept enabled: passing ssl.verifypeer = FALSE accepts any certificate, which
# lets a man-in-the-middle substitute the data undetected.
downloaded <- getURL(url, ssl.verifypeer = TRUE)
# Parse the downloaded text through an in-memory connection. header = FALSE
# because the file has no header row, so columns get the names V1..V5.
connection <- textConnection(downloaded)
# textConnection() hands back an already-open connection that read.csv() does
# not close, so release it explicitly, even if parsing fails.
dataset <- tryCatch(
  read.csv(connection, header = FALSE),
  finally = close(connection)
)
# Preview the first 6 rows (the default for head()).
head(dataset)

##    V1  V2  V3  V4          V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa

# Univariate plots of the four numeric iris measurements (columns 1-4), one
# panel per variable laid out side by side.
data(iris)

# Remember the layout that was active before this section so it can be put
# back afterwards; par() returns the previous values of whatever it sets.
old_par <- par(mfrow = c(1, 4))

# Histograms.
for (i in 1:4) {
  hist(iris[, i], main = names(iris)[i])
}

# Kernel density estimates; setting mfrow again restarts the 1 x 4 layout.
par(mfrow = c(1, 4))
for (i in 1:4) {
  plot(density(iris[, i]), main = names(iris)[i])
}

# Box plots.
par(mfrow = c(1, 4))
for (i in 1:4) {
  boxplot(iris[, i], main = names(iris)[i])
}

# Restore the previous layout so that later plots are not squeezed into a
# single quarter-width panel.
par(old_par)

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(mlbench)
# Missingness map (Amelia::missmap) for the Soybean data set; `col` supplies
# the two cell colours and the legend is switched off.
data(Soybean)
missmap(
  Soybean,
  col = c("black", "grey"),
  legend = FALSE
)
# MULTIVARIATE VISUALIZATION
library(corrplot)

## corrplot 0.92 loaded

# Pairwise relationships between the iris attributes.
data("iris")
# Correlation matrix (cor() default: Pearson) of the four numeric
# measurements in columns 1-4.
correlations <- cor(iris[, 1:4])
# Draw the correlation matrix. It used to be computed but never displayed,
# which left the corrplot package loaded for this section unused.
corrplot(correlations, method = "circle")

# Scatterplot matrix of every column of iris.
pairs(iris)

# Scatterplot matrix with the points coloured by species.
pairs(Species ~ ., data = iris, col = iris$Species)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Views of the iris predictors split by class, drawn with caret's
# featurePlot().
x <- iris[1:4]            # the four predictor columns
y <- iris[["Species"]]    # class labels (column 5)
# Request a "free" axis relation for both the x and the y scale.
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# Density plot of each predictor, with y as the grouping variable.
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Box plot of each predictor, with y as the grouping variable.
featurePlot(x = x, y = y, plot = "box")

# Section: Peek at your data
library(mlbench)
data(PimaIndiansDiabetes)
# Display the first 20 observations of the data frame.
PimaIndiansDiabetes[seq_len(20), ]

##    pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1         6     148       72      35       0 33.6    0.627  50      pos
## 2         1      85       66      29       0 26.6    0.351  31      neg
## 3         8     183       64       0       0 23.3    0.672  32      pos
## 4         1      89       66      23      94 28.1    0.167  21      neg
## 5         0     137       40      35     168 43.1    2.288  33      pos
## 6         5     116       74       0       0 25.6    0.201  30      neg
## 7         3      78       50      32      88 31.0    0.248  26      pos
## 8        10     115        0       0       0 35.3    0.134  29      neg
## 9         2     197       70      45     543 30.5    0.158  53      pos
## 10        8     125       96       0       0  0.0    0.232  54      pos
## 11        4     110       92       0       0 37.6    0.191  30      neg
## 12       10     168       74       0       0 38.0    0.537  34      pos
## 13       10     139       80       0       0 27.1    1.441  57      neg
## 14        1     189       60      23     846 30.1    0.398  59      pos
## 15        5     166       72      19     175 25.8    0.587  51      pos
## 16        7     100        0       0       0 30.0    0.484  32      pos
## 17        0     118       84      47     230 45.8    0.551  31      pos
## 18        7     107       74       0       0 29.6    0.254  31      pos
## 19        1     103       30      38      83 43.3    0.183  33      neg
## 20        1     115       70      30      96 34.6    0.529  32      pos

# Size of the data set: number of rows followed by number of columns.
c(nrow(PimaIndiansDiabetes), ncol(PimaIndiansDiabetes))

## [1] 768   9

# Class of every BostonHousing column, returned as a named character vector
# (one class name per column).
data(BostonHousing)
vapply(BostonHousing, class, character(1))

##      crim        zn     indus      chas       nox        rm       age       dis 
## "numeric" "numeric" "numeric"  "factor" "numeric" "numeric" "numeric" "numeric" 
##       rad       tax   ptratio         b     lstat      medv 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"

# Distribution of the class variable: absolute counts and percentage share.
y <- PimaIndiansDiabetes$diabetes
# Multiply by 100 only after normalising. prop.table(table(y) * 100) scales
# the counts first, so the factor cancels out and the column holds fractions
# (0.65 / 0.35) rather than percentages.
cbind(freq = table(y), percentage = prop.table(table(y)) * 100)

##     freq percentage
## neg  500   65.10417
## pos  268   34.89583

# Min, quartiles, median, mean and max of the glucose column.
summary(PimaIndiansDiabetes[["glucose"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   117.0   120.9   140.2   199.0

Datasets Processing

LG

2022-06-28