#Libraries requried
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.3
#Loading in the data
For this exercise we are using the iris data set. The question that we are asking is “Can we identify the data using the sepal and pedal measurements?”.
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#Exploring the data
This will look through the data seeing if there is no variance between the species.
iris$NoVariance <-1
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species NoVariance
## setosa :50 Min. :1
## versicolor:50 1st Qu.:1
## virginica :50 Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
This chunk shows that this data set needs to be cleaned and remove or replace the NAs that are in the data set.
apply(iris, 2, var)
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species NoVariance
## 0.6856935 0.1899794 3.1162779 0.5810063 NA 0.0000000
which(apply(iris, 2, var) == 0)
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## NoVariance
## 6
iris <- iris[ - as.numeric(which(apply(iris, 2, var) == 0))]
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
This shows the lengths for the sepal and pedeal length and width. It aslo shows the NAs that in the data set
iris$Sepal.Length[c(15, 20, 50, 67, 97, 118)] <- NA
iris$Sepal.Width[c(4, 80, 97, 106)] <- NA
iris$Petal.Length[c(5, 17, 35, 49)] <- NA
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.400 Median :1.300
## Mean :5.844 Mean :3.062 Mean :3.822 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.375 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## NA's :6 NA's :4 NA's :4
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
##
This chunk shows the length width for the sepal and pedals
length(which(is.na(iris)))
## [1] 14
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 4 4.6 NA 1.5 0.2 setosa
## 5 5.0 3.6 NA 0.2 setosa
## 15 NA 4.0 1.2 0.2 setosa
## 17 5.4 3.9 NA 0.4 setosa
## 20 NA 3.8 1.5 0.3 setosa
## 35 4.9 3.1 NA 0.2 setosa
## 49 5.3 3.7 NA 0.2 setosa
## 50 NA 3.3 1.4 0.2 setosa
## 67 NA 3.0 4.5 1.5 versicolor
## 80 5.7 NA 3.5 1.0 versicolor
## 97 NA NA 4.2 1.3 versicolor
## 106 7.6 NA 6.6 2.1 virginica
## 118 NA 3.8 6.7 2.2 virginica
#Cleaning the data
These two chunks below will clean the data set by replacing the NAs with numerical values for the sepal and pedal length and width.
iris_clean <- iris[complete.cases(iris), ]
length(which(is.na(iris_clean)))
## [1] 0
iris[is.na(iris$Sepal.Length) & (iris$Species == "setosa"),"Sepal.Length"] <- median(iris$Sepal.Length[which(iris$Species == "setosa")], na.rm = TRUE)
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 4 4.6 NA 1.5 0.2 setosa
## 5 5.0 3.6 NA 0.2 setosa
## 17 5.4 3.9 NA 0.4 setosa
## 35 4.9 3.1 NA 0.2 setosa
## 49 5.3 3.7 NA 0.2 setosa
## 67 NA 3.0 4.5 1.5 versicolor
## 80 5.7 NA 3.5 1.0 versicolor
## 97 NA NA 4.2 1.3 versicolor
## 106 7.6 NA 6.6 2.1 virginica
## 118 NA 3.8 6.7 2.2 virginica
iris[is.na(iris$Sepal.Length) & (iris$Species == "versicolor"),"Sepal.Length"] <- median(iris$Sepal.Length[which(iris$Species == "versicolor")], na.rm = TRUE)
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 4 4.60 NA 1.5 0.2 setosa
## 5 5.00 3.6 NA 0.2 setosa
## 17 5.40 3.9 NA 0.4 setosa
## 35 4.90 3.1 NA 0.2 setosa
## 49 5.30 3.7 NA 0.2 setosa
## 80 5.70 NA 3.5 1.0 versicolor
## 97 5.95 NA 4.2 1.3 versicolor
## 106 7.60 NA 6.6 2.1 virginica
## 118 NA 3.8 6.7 2.2 virginica
iris[is.na(iris$Sepal.Length) & (iris$Species == "virginica"),"Sepal.Length"] <- median(iris$Sepal.Length[which(iris$Species == "virginica")], na.rm = TRUE)
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 4 4.60 NA 1.5 0.2 setosa
## 5 5.00 3.6 NA 0.2 setosa
## 17 5.40 3.9 NA 0.4 setosa
## 35 4.90 3.1 NA 0.2 setosa
## 49 5.30 3.7 NA 0.2 setosa
## 80 5.70 NA 3.5 1.0 versicolor
## 97 5.95 NA 4.2 1.3 versicolor
## 106 7.60 NA 6.6 2.1 virginica
iris[is.na(iris$Sepal.Width) & (iris$Species == "setosa"),"Sepal.Width"] <- median(iris$Sepal.Width[which(iris$Species == "setosa")], na.rm = TRUE)
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 5 5.00 3.6 NA 0.2 setosa
## 17 5.40 3.9 NA 0.4 setosa
## 35 4.90 3.1 NA 0.2 setosa
## 49 5.30 3.7 NA 0.2 setosa
## 80 5.70 NA 3.5 1.0 versicolor
## 97 5.95 NA 4.2 1.3 versicolor
## 106 7.60 NA 6.6 2.1 virginica
iris[is.na(iris$Petal.Length) & (iris$Species == "setosa"),"Petal.Length"] <- median(iris$Petal.Length[which(iris$Species == "setosa")], na.rm = TRUE)
iris_NA <- iris[!complete.cases(iris), ]
iris_NA
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 80 5.70 NA 3.5 1.0 versicolor
## 97 5.95 NA 4.2 1.3 versicolor
## 106 7.60 NA 6.6 2.1 virginica
iris_set <- iris[, -5]
str(iris_set)
## 'data.frame': 150 obs. of 4 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.4 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.5 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
matriz_cor <- cor(iris_set)
matriz_cor
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 NA 0.8778003 0.8250292
## Sepal.Width NA 1 NA NA
## Petal.Length 0.8778003 NA 1.0000000 0.9630281
## Petal.Width 0.8250292 NA 0.9630281 1.0000000
for (i in 1:nrow(matriz_cor)){
correlations <- which((matriz_cor[i,] > 0.85) & (matriz_cor[i,] != 1))
if(length(correlations)> 0){
print(colnames(iris_set)[i])
print(correlations)
}
}
## [1] "Sepal.Length"
## Petal.Length
## 3
## [1] "Petal.Length"
## Sepal.Length Petal.Width
## 1 4
## [1] "Petal.Width"
## Petal.Length
## 3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
corrplot(matriz_cor, method = "ellipse")
corr_check <- function(Dataset, threshold){
matriz_cor <- cor(Dataset)
matriz_cor
for (i in 1:nrow(matriz_cor)){
correlations <- which((abs(matriz_cor[i,i:ncol(matriz_cor)]) > threshold) & (matriz_cor[i,i:ncol(matriz_cor)] != 1))
if(length(correlations)> 0){
lapply(correlations,FUN = function(x) (cat(paste(colnames(Dataset)[i], "with",colnames(Dataset)[x]), "\n")))
}
}
}
corr_check(iris_set, 0.85)
## Sepal.Length with Petal.Length
## Petal.Length with Sepal.Width