# Load the Iris data set from the working directory; cells containing "?"
# are read in as NA.
df <- read.csv(
  file = "Iris.csv",
  na.strings = "?"
)
# Preview the first rows of the loaded table.
head(df)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
# Show the structure of the data frame: dimensions, column types and the
# first few values of every column.
str(df)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
# Print per-column summary statistics of the loaded data.
summary(df)
## Id SepalLengthCm SepalWidthCm PetalLengthCm
## Min. : 1.00 Min. :4.300 Min. :2.000 Min. :1.000
## 1st Qu.: 38.25 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600
## Median : 75.50 Median :5.800 Median :3.000 Median :4.350
## Mean : 75.50 Mean :5.843 Mean :3.054 Mean :3.759
## 3rd Qu.:112.75 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100
## Max. :150.00 Max. :7.900 Max. :4.400 Max. :6.900
## PetalWidthCm Species
## Min. :0.100 Length:150
## 1st Qu.:0.300 Class :character
## Median :1.300 Mode :character
## Mean :1.199
## 3rd Qu.:1.800
## Max. :2.500
# Count the missing values in every column. vapply() pins the result to one
# integer per column, whereas sapply()'s return type depends on its input.
vapply(df, function(x) sum(is.na(x)), integer(1))
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0 0 0 0 0
## Species
## 0
Zbiór danych jest kompletny
# Drop the first column (the row identifier `Id`) and keep the remaining
# columns for the analysis.
data <- df[, -1]
# Preview the reduced table.
head(data)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Pairwise plot matrix of the first four columns, coloured by Species.
ggpairs(
  data,
  columns = 1:4,
  aes(color = Species)
) +
  ggtitle("Iris Data wg gatunku")
Szerokość płatka (petal width) i długość płatka (petal length) wykazują korelację liniową.
# Fix the RNG seed so the random k-means initialisation is reproducible.
set.seed(101)
# Cluster the four measurement columns into 3 groups using 20 random starts.
# The `centers` argument is spelled out in full instead of relying on
# partial argument matching.
irisCluster <- kmeans(data[, 1:4], centers = 3, nstart = 20)
# Print the fitted clustering.
irisCluster
## K-means clustering with 3 clusters of sizes 62, 38, 50
##
## Cluster means:
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 5.901613 2.748387 4.393548 1.433871
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.006000 3.418000 1.464000 0.244000
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
## [112] 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2
## [149] 2 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82097 23.87947 15.24040
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
i rysujemy dla nich wykresy:
library(cluster)
# Plot the k-means clusters using the same four measurement columns that
# were clustered, rather than the built-in `iris` data set, whose Species
# column would otherwise enter the plot. TRUE is written out because `T` is
# an ordinary, reassignable variable.
clusplot(
  data[, 1:4],
  irisCluster$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 0,
  lines = 0
)
w proporcji 80:20
# Draw the row indices of the training set: 80% of the rows, sampled with
# respect to Species. The draw is random, so its reproducibility depends on
# the RNG state left by the earlier set.seed() call.
id <- createDataPartition(
  data$Species,
  p = 0.80,
  list = FALSE
)
# Training rows. NOTE(review): this object shares its name with caret's
# train() function; later train(...) calls still resolve to the function,
# but a distinct name would be clearer.
train <- data[id, ]
# Report the number of rows and columns of the training set.
dim(train)
## [1] 120 5
# Every row that was not selected for training forms the test set.
test <- data[-id, ]
# Report the number of rows and columns of the test set.
dim(test)
## [1] 30 5
# Preview the first rows of the test set.
head(test)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
## 9 4.4 2.9 1.4 0.2 Iris-setosa
## 10 4.9 3.1 1.5 0.1 Iris-setosa
## 13 4.8 3.0 1.4 0.1 Iris-setosa
## 29 5.2 3.4 1.4 0.2 Iris-setosa
# Preview the first rows of the training set.
head(train)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 7 4.6 3.4 1.4 0.3 Iris-setosa
## 8 5.0 3.4 1.5 0.2 Iris-setosa
# Number of distinct values in every column of the training set: first
# reduce each column to its unique values, then take the lengths.
lapply(lapply(train, unique), length)
## $SepalLengthCm
## [1] 35
##
## $SepalWidthCm
## [1] 22
##
## $PetalLengthCm
## [1] 40
##
## $PetalWidthCm
## [1] 22
##
## $Species
## [1] 3
# Reset the seed so the resampling performed by caret is reproducible.
set.seed(101)
# Fit a CART model (caret method "rpart2"): columns 1-4 are the predictors,
# column 5 (Species) is the outcome.
cart_model <- train(
  x = train[, 1:4],
  y = train[, 5],
  method = "rpart2"
)
## note: only 2 possible values of the max tree depth from the initial fit.
## Truncating the grid to 2 .
# Predict the species of the test rows with the CART model.
predictions <- predict(cart_model, newdata = test[, 1:4])
# Tabulate how many test rows were assigned to each species.
table(predictions)
## predictions
## Iris-setosa Iris-versicolor Iris-virginica
## 10 12 8
# Feature importance of the CART model, computed by caret and then plotted.
importance_cart <- varImp(cart_model)
plot(
  importance_cart,
  main = "Ważność zmiennych cart_model"
)
Zgodnie z przewidywaniami, petal width i petal length miały największy wpływ na klasyfikację
# Reset the seed so the resampling performed by caret is reproducible.
set.seed(101)
# Fit a k-nearest-neighbours model; the predictors are centred and scaled
# before fitting.
knn_model <- train(
  x = train[, 1:4],
  y = train[, 5],
  method = "knn",
  preProcess = c("center", "scale")
)
# Predict the species of the test rows with the kNN model.
predictions <- predict(knn_model, newdata = test[, 1:4])
# Tabulate how many test rows were assigned to each species.
table(predictions)
## predictions
## Iris-setosa Iris-versicolor Iris-virginica
## 10 12 8
# Feature importance of the kNN model, computed by caret and then plotted.
importance_knn <- varImp(knn_model)
plot(
  importance_knn,
  main = "Wpływ zmiennych na knn_model"
)
W tym modelu zmienne petal width, petal length oraz dodatkowo sepal length (długość działki) miały największy wpływ na klasyfikację.
# Reset the seed so the resampling and the network fit are reproducible.
set.seed(101)
# Fit a neural network (caret method "nnet") on centred and scaled
# predictors, with a tuning grid of length 2. `trace` and `maxit` are not
# caret arguments here; they are passed on to the underlying fit.
nnet_model <- train(
  x = train[, 1:4],
  y = train[, 5],
  method = "nnet",
  preProcess = c("center", "scale"),
  tuneLength = 2,
  trace = FALSE,
  maxit = 100
)
# Predict the species of the test rows with the neural network.
predictions <- predict(nnet_model, newdata = test[, 1:4], type = "raw")
# Compute the variable importance and print it.
importance_nnet <- varImp(nnet_model)
importance_nnet
## nnet variable importance
##
## variables are sorted by maximum importance across the classes
## Overall Iris-setosa Iris-versicolor Iris-virginica
## PetalLengthCm 1.000e+02 1.000e+02 100.00 1.000e+02
## PetalWidthCm 9.334e+01 9.334e+01 93.34 9.334e+01
## SepalWidthCm 2.454e+01 2.454e+01 24.54 2.454e+01
## SepalLengthCm 6.883e-15 6.883e-15 0.00 6.883e-15
# Gather the resampling results of the three fitted models. The list is
# unnamed, so they are reported as Model1 (CART), Model2 (kNN) and
# Model3 (nnet), in that order.
models_compare <- resamples(
  list(cart_model, knn_model, nnet_model)
)
# Summarise the resampled performance metrics of each model.
summary(models_compare)
##
## Call:
## summary.resamples(object = models_compare)
##
## Models: Model1, Model2, Model3
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.8837209 0.9302326 0.9387755 0.9465999 0.9705882 1 0
## Model2 0.8604651 0.9318182 0.9545455 0.9479642 0.9750000 1 0
## Model3 0.8863636 0.9534884 0.9600000 0.9608532 0.9777778 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Model1 0.8234811 0.8921405 0.9050388 0.9182731 0.9531034 1 0
## Model2 0.7892157 0.8944844 0.9312500 0.9207528 0.9609375 1 0
## Model3 0.8291925 0.9284526 0.9397590 0.9403740 0.9665211 1 0
Z powyższych danych wynika, że trzeci model – sieć neuronowa – cechuje się największą dokładnością przewidywań.