#Libraries required

library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.3

#Loading in the data

For this exercise we are using the iris data set. The question that we are asking is “Can we identify the species using the sepal and petal measurements?”.

# Load the iris data set and print its structure (column types and first values).
data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

#Exploring the data

This step adds a constant column, NoVariance, to the data and prints a summary, so that we have a variable with no variance to detect in the next step.

# Append a constant column so that the data contains a zero-variance variable,
# then summarise every column.
iris[["NoVariance"]] <- 1
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species     NoVariance
##  setosa    :50   Min.   :1   
##  versicolor:50   1st Qu.:1   
##  virginica :50   Median :1   
##                  Mean   :1   
##                  3rd Qu.:1   
##                  Max.   :1

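This chunk computes the variance of every column and removes the columns whose variance is zero (here, NoVariance). The NA reported for Species, and the coercion warning, appear because Species is a factor rather than a number.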

# Variance of every column. apply() turns the data frame into a matrix first,
# so the factor column Species cannot be read as a number: it yields NA and
# triggers the coercion warning shown below.
apply(iris, 2, var)
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species   NoVariance 
##    0.6856935    0.1899794    3.1162779    0.5810063           NA    0.0000000
# Name and position of each column whose variance is exactly zero.
which(apply(iris, 2, var) == 0)
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## NoVariance 
##          6
# Drop the columns whose variance is exactly zero.
# The columns to keep are selected explicitly instead of negating the result
# of which(): a negated empty index selects nothing, so when no column has
# zero variance the negative form would remove every column from the data.
# Columns with an NA variance (such as the factor Species) are kept because
# which() ignores NA.
iris <- iris[setdiff(seq_along(iris), which(apply(iris, 2, var) == 0))]
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
# Print the structure again to check the result of the removal step.
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

This chunk introduces missing values (NAs) into the sepal length, sepal width and petal length columns and then summarises the data. The summary now also shows the number of NAs in each column.

# Mark selected observations as missing in three of the measurement columns,
# then summarise the data so the NA counts become visible.
is.na(iris$Sepal.Length) <- c(15, 20, 50, 67, 97, 118)
is.na(iris$Sepal.Width) <- c(4, 80, 97, 106)
is.na(iris$Petal.Length) <- c(5, 17, 35, 49)
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.400   Median :1.300  
##  Mean   :5.844   Mean   :3.062   Mean   :3.822   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.375   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##  NA's   :6       NA's   :4       NA's   :4                      
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##                 
## 

This chunk counts the NAs in the data set and lists the rows that contain at least one NA.

# Count the missing cells, then collect and print the rows that have at least
# one missing value.
sum(is.na(iris))
## [1] 14
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 4            4.6          NA          1.5         0.2     setosa
## 5            5.0         3.6           NA         0.2     setosa
## 15            NA         4.0          1.2         0.2     setosa
## 17           5.4         3.9           NA         0.4     setosa
## 20            NA         3.8          1.5         0.3     setosa
## 35           4.9         3.1           NA         0.2     setosa
## 49           5.3         3.7           NA         0.2     setosa
## 50            NA         3.3          1.4         0.2     setosa
## 67            NA         3.0          4.5         1.5 versicolor
## 80           5.7          NA          3.5         1.0 versicolor
## 97            NA          NA          4.2         1.3 versicolor
## 106          7.6          NA          6.6         2.1  virginica
## 118           NA         3.8          6.7         2.2  virginica

#Cleaning the data

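The chunks below clean the data set in two ways. The first creates iris_clean by dropping every row that contains an NA. The others replace the NAs in iris with the median of the same measurement for the same species. Note that Sepal.Width is only imputed for setosa, so rows 80, 97 and 106 still contain NAs afterwards, which is why Sepal.Width shows NA in the correlation matrix further down.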

# Keep only the fully observed rows in a separate copy and confirm that no
# missing cells remain in it.
iris_clean <- iris[rowSums(is.na(iris)) == 0, ]
sum(is.na(iris_clean))
## [1] 0
# Fill the missing setosa sepal lengths with the setosa median, then list the
# rows that are still incomplete.
iris$Sepal.Length[is.na(iris$Sepal.Length) & iris$Species == "setosa"] <-
  median(iris$Sepal.Length[iris$Species == "setosa"], na.rm = TRUE)
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 4            4.6          NA          1.5         0.2     setosa
## 5            5.0         3.6           NA         0.2     setosa
## 17           5.4         3.9           NA         0.4     setosa
## 35           4.9         3.1           NA         0.2     setosa
## 49           5.3         3.7           NA         0.2     setosa
## 67            NA         3.0          4.5         1.5 versicolor
## 80           5.7          NA          3.5         1.0 versicolor
## 97            NA          NA          4.2         1.3 versicolor
## 106          7.6          NA          6.6         2.1  virginica
## 118           NA         3.8          6.7         2.2  virginica
# Fill the missing versicolor sepal lengths with the versicolor median, then
# list the rows that are still incomplete.
iris$Sepal.Length[is.na(iris$Sepal.Length) & iris$Species == "versicolor"] <-
  median(iris$Sepal.Length[iris$Species == "versicolor"], na.rm = TRUE)
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 4           4.60          NA          1.5         0.2     setosa
## 5           5.00         3.6           NA         0.2     setosa
## 17          5.40         3.9           NA         0.4     setosa
## 35          4.90         3.1           NA         0.2     setosa
## 49          5.30         3.7           NA         0.2     setosa
## 80          5.70          NA          3.5         1.0 versicolor
## 97          5.95          NA          4.2         1.3 versicolor
## 106         7.60          NA          6.6         2.1  virginica
## 118           NA         3.8          6.7         2.2  virginica
# Fill the missing virginica sepal lengths with the virginica median, then
# list the rows that are still incomplete.
iris$Sepal.Length[is.na(iris$Sepal.Length) & iris$Species == "virginica"] <-
  median(iris$Sepal.Length[iris$Species == "virginica"], na.rm = TRUE)
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 4           4.60          NA          1.5         0.2     setosa
## 5           5.00         3.6           NA         0.2     setosa
## 17          5.40         3.9           NA         0.4     setosa
## 35          4.90         3.1           NA         0.2     setosa
## 49          5.30         3.7           NA         0.2     setosa
## 80          5.70          NA          3.5         1.0 versicolor
## 97          5.95          NA          4.2         1.3 versicolor
## 106         7.60          NA          6.6         2.1  virginica
# Fill the missing setosa sepal widths with the setosa median, then list the
# rows that are still incomplete.
iris$Sepal.Width[is.na(iris$Sepal.Width) & iris$Species == "setosa"] <-
  median(iris$Sepal.Width[iris$Species == "setosa"], na.rm = TRUE)
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 5           5.00         3.6           NA         0.2     setosa
## 17          5.40         3.9           NA         0.4     setosa
## 35          4.90         3.1           NA         0.2     setosa
## 49          5.30         3.7           NA         0.2     setosa
## 80          5.70          NA          3.5         1.0 versicolor
## 97          5.95          NA          4.2         1.3 versicolor
## 106         7.60          NA          6.6         2.1  virginica
# Fill the missing setosa petal lengths with the setosa median, then list the
# rows that are still incomplete.
iris$Petal.Length[is.na(iris$Petal.Length) & iris$Species == "setosa"] <-
  median(iris$Petal.Length[iris$Species == "setosa"], na.rm = TRUE)
iris_NA <- iris[rowSums(is.na(iris)) > 0, ]
iris_NA
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 80          5.70          NA          3.5         1.0 versicolor
## 97          5.95          NA          4.2         1.3 versicolor
## 106         7.60          NA          6.6         2.1  virginica
# Keep everything except the fifth column and print the structure of the
# result.
iris_set <- iris[-5]
str(iris_set)
## 'data.frame':    150 obs. of  4 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.4 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.5 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
# Pearson correlation between every pair of columns in iris_set, printed as a
# matrix. cor() is called without a `use` argument, so a column that contains
# NA produces NA correlations.
matriz_cor <- cor(iris_set, method = "pearson")
print(matriz_cor)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000          NA    0.8778003   0.8250292
## Sepal.Width            NA           1           NA          NA
## Petal.Length    0.8778003          NA    1.0000000   0.9630281
## Petal.Width     0.8250292          NA    0.9630281   1.0000000
# For every variable, print its name followed by the names and positions of
# the other variables whose correlation with it is stronger than 0.85.
for (i in seq_len(nrow(matriz_cor))) {
  row_values <- matriz_cor[i, ]

  # Compare magnitudes so that strong negative correlations are reported too,
  # and leave out the variable itself by position rather than by testing the
  # value against 1. which() drops comparisons that are NA.
  correlations <- which(abs(row_values) > 0.85 & seq_along(row_values) != i)

  if (length(correlations) > 0) {
    print(colnames(iris_set)[i])
    print(correlations)
  }
}
## [1] "Sepal.Length"
## Petal.Length 
##            3 
## [1] "Petal.Length"
## Sepal.Length  Petal.Width 
##            1            4 
## [1] "Petal.Width"
## Petal.Length 
##            3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
# Plot the correlation matrix using the "ellipse" drawing method.
corrplot(matriz_cor, method = "ellipse")

# Print every pair of columns in `Dataset` whose correlation is stronger than
# `threshold` in absolute value.
#
# Dataset:   the columns to compare; passed unchanged to cor().
# threshold: cut-off; a pair is reported when abs(correlation) > threshold.
#
# Each pair is printed once, on its own line, as "<first> with <second>".
# Pairs whose correlation is NA are skipped. Returns NULL invisibly.
corr_check <- function(Dataset, threshold) {
  cor_matrix <- cor(Dataset)
  var_names <- colnames(Dataset)
  n_vars <- ncol(cor_matrix)

  for (i in seq_len(max(n_vars - 1L, 0L))) {
    # Look only at the columns to the right of the diagonal: this reports each
    # pair once and leaves out the correlation of a column with itself.
    later <- (i + 1L):n_vars

    # which() returns positions inside the slice, so map them back to column
    # positions in the full matrix before looking up names. It also drops NA.
    hits <- later[which(abs(cor_matrix[i, later]) > threshold)]

    for (j in hits) {
      cat(paste(var_names[i], "with", var_names[j]), "\n")
    }
  }

  invisible(NULL)
}

# Report the pairs of columns in iris_set whose correlation exceeds 0.85 in
# absolute value.
corr_check(iris_set, 0.85)
## Sepal.Length with Petal.Length 
## Petal.Length with Sepal.Width