# Load the iris measurements from a local CSV file. The file has no header
# row, so read.csv() assigns the default column names V1..V5.
filename <- "C:/Users/18324/Documents/AI4OPT/R stats/iris.csv"
dataset <- read.csv(file = filename, header = FALSE)
# Preview the first rows (head() shows 6 rows by default).
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
library(RCurl)
# Fetch the iris data set from the UCI repository and parse it as a
# header-less CSV.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Download the file as one string. TLS certificate verification is left
# enabled: passing ssl.verifypeer = FALSE would accept a forged certificate
# and let the data be tampered with in transit.
downloaded <- getURL(url)
# Wrap the downloaded text in a connection so read.csv() can parse it.
connection <- textConnection(downloaded)
dataset <- read.csv(connection, header = FALSE)
# read.csv() does not close a connection that was already open, so release
# it explicitly to avoid leaking it.
close(connection)
# Preview the first rows (head() shows 6 rows by default).
head(dataset)
## V1 V2 V3 V4 V5
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Univariate plots of the four numeric iris columns: histograms, density
# curves and box plots, each drawn as one row of four panels.
data(iris)

# Switch to a 1 x 4 panel layout and remember the previous setting so it can
# be restored afterwards; otherwise every later base-graphics plot would be
# squeezed into a quarter-width panel.
old_par <- par(mfrow = c(1, 4))

# Histograms, one panel per measurement.
for (i in seq_len(4)) {
  hist(iris[, i], main = names(iris)[i])
}

# Kernel density estimates. Re-issuing mfrow starts a fresh row of panels.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = names(iris)[i])
}

# Box-and-whisker plots.
par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(iris[, i], main = names(iris)[i])
}

# Restore the layout that was active before this section.
par(old_par)

# Missing-data plot: draw a missingness map of the Soybean data set.
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
data("Soybean")
# Two-colour map with the legend suppressed.
missmap(
  Soybean,
  col = c("black", "grey"),
  legend = FALSE
)
# MULTIVARIATE VISUALIZATION
library(corrplot)
## corrplot 0.92 loaded
data("iris")
# Pairwise correlations between the four numeric measurement columns.
correlations <- cor(iris[, 1:4])
# Plot the correlation matrix; without this call the matrix above and the
# corrplot package would go unused.
corrplot(correlations, method = "circle")
# Scatterplot matrix of all iris columns.
pairs(iris)


# Scatterplot matrix via the formula interface, with points coloured by
# species.
pairs(Species ~ ., data = iris, col = iris$Species)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Split iris into the first four columns (predictors) and the fifth column
# (class label) for caret's featurePlot().
x <- iris[, 1:4]
y <- iris[, 5]
# Request a "free" axis relation for both x and y, passed on to featurePlot().
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
# Density plot of the predictors against the class label.
featurePlot(x = x, y = y, plot = "density", scales = scales)

# Box plot of the predictors against the class label.
featurePlot(x = x, y = y, plot = "box")

# Section: Peek at your data
library(mlbench)
data("PimaIndiansDiabetes")
# Show the first 20 rows.
head(PimaIndiansDiabetes, 20)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
## 7 3 78 50 32 88 31.0 0.248 26 pos
## 8 10 115 0 0 0 35.3 0.134 29 neg
## 9 2 197 70 45 543 30.5 0.158 53 pos
## 10 8 125 96 0 0 0.0 0.232 54 pos
## 11 4 110 92 0 0 37.6 0.191 30 neg
## 12 10 168 74 0 0 38.0 0.537 34 pos
## 13 10 139 80 0 0 27.1 1.441 57 neg
## 14 1 189 60 23 846 30.1 0.398 59 pos
## 15 5 166 72 19 175 25.8 0.587 51 pos
## 16 7 100 0 0 0 30.0 0.484 32 pos
## 17 0 118 84 47 230 45.8 0.551 31 pos
## 18 7 107 74 0 0 29.6 0.254 31 pos
## 19 1 103 30 38 83 43.3 0.183 33 neg
## 20 1 115 70 30 96 34.6 0.529 32 pos
# Number of rows and columns.
dim(PimaIndiansDiabetes)
## [1] 768 9
# Report the class of every column in BostonHousing as a named character
# vector (one class name per column).
data("BostonHousing")
vapply(BostonHousing, class, character(1))
## crim zn indus chas nox rm age dis
## "numeric" "numeric" "numeric" "factor" "numeric" "numeric" "numeric" "numeric"
## rad tax ptratio b lstat medv
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# Distribution of the class variable: count and percentage of each outcome.
y <- PimaIndiansDiabetes$diabetes
class_counts <- table(y)
# Normalise to proportions first, then scale to percent. Multiplying the
# counts by 100 before prop.table() has no effect, because prop.table()
# divides by the (equally scaled) total.
cbind(freq = class_counts, percentage = prop.table(class_counts) * 100)
## freq percentage
## neg 500 65.10417
## pos 268 34.89583
# Summary statistics (min, quartiles, median, mean, max) for the glucose column.
summary(PimaIndiansDiabetes[["glucose"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 117.0 120.9 140.2 199.0