Install the required packages

Import the required packages

library(tidyverse)
library(grid)
library(gridExtra)
library(forcats)
library(modelr)
library(caret)
library(kknn)

Data Cleaning

  1. Load the iris data into a tibble
  2. Get an overview of the data
  3. Coerce characters (strings) into factors (categorical data representation)
  4. Reformat tibble into a LONG format (for ease of visualisation)

Load Data into Tibble & Overview

# Convert iris to a tibble, then print summary statistics for every column.
iris <- iris %>% as_tibble()
summary(object = iris)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species       Flower      
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :50   Min.   :  1.00  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:50   1st Qu.: 38.25  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300   virginica :50   Median : 75.50  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199                   Mean   : 75.50  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                   3rd Qu.:112.75  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500                   Max.   :150.00  

Explore the number of observations per class (species) visually

# Bar chart counting the rows for each Species; bars are filled by Species.
ggplot(data = iris) +
  geom_bar(aes(x = Species, fill = Species))

Use sapply() to check the class of each variable

# Apply class() to every column of iris and print the result per column name.
sapply(X = iris, FUN = class)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species       Flower 
   "numeric"    "numeric"    "numeric"    "numeric"     "factor"    "integer" 

Alternatively, the map() function (from the tidyverse) applies a function to every column and returns a list

# Standard deviation of every column, returned as a list.
# NOTE: Species is a factor, so sd() emits the deprecation warning shown in
# the output for that column.
map(.x = iris, .f = sd)
Calling var(x) on a factor x is deprecated and will become an error.
  Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
$Sepal.Length
[1] 0.8280661

$Sepal.Width
[1] 0.4358663

$Petal.Length
[1] 1.765298

$Petal.Width
[1] 0.7622377

$Species
[1] 0.8192319

$Flower
[1] 43.44537
# Mean of every column, returned as a list.
# NOTE: Species is a factor, so mean() warns and returns NA for that column.
map(.x = iris, .f = mean)
argument is not numeric or logical: returning NA
$Sepal.Length
[1] 5.843333

$Sepal.Width
[1] 3.057333

$Petal.Length
[1] 3.758

$Petal.Width
[1] 1.199333

$Species
[1] NA

$Flower
[1] 75.5

How many NA values?

# CREATE A FUNCTION
# Percentage of entries in x that are NA.
#   x: a vector.
#   Returns a single number between 0 and 100 (NaN when x is empty).
na_data <- function(x) {
  n_missing <- sum(is.na(x))
  n_total <- length(x)
  n_missing / n_total * 100
}
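# Dividing by length(x) and multiplying by 100 turns the NA count into a
# percentage of missing values; na_data2 below returns the raw count instead.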
# Number of entries in x that are NA.
#   x: a vector.
#   Returns a single integer count.
na_data2 <- function(x) {
  missing_flags <- is.na(x)
  sum(missing_flags)
}
# Count the NA values in every column of long_iris.
# NOTE: long_iris is created in the "Reformat the tibble into LONG format"
# section further down, so that chunk has to be run before this one.
# vapply() visits each column directly and guarantees one integer per column;
# apply() would first coerce the whole tibble to a matrix.
vapply(long_iris, na_data2, integer(1))
Species    part measure   value 
      0       0       0       0 

Reformat the tibble into LONG format

# Reshape iris to long format: stack the four measurement columns into
# 'part'/'value' pairs, then split names such as "Sepal.Length" at the dot
# into 'part' ("Sepal") and 'measure' ("Length").
long_iris <- iris %>%
  gather(
    key = "part",
    value = "value",
    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
  ) %>%
  separate(part, into = c("part", "measure"), sep = "\\.")

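gather() - key = the name of the new column that holds the old column headers (as row values) / value = the name of the new column that holds the cell values / the remaining arguments are the columns to gather under the key. separate() - splits the 'part' column at the '.' into two columns, 'part' and 'measure'. Note: the separator has to be written as '\\.' because sep is a regular expression: '\.' matches a literal dot, and the backslash itself must be escaped with a second backslash inside an R string.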

Coerce Characters into factors

# Names of the columns that should be stored as categorical data.
factors <- c("part", "measure")
# Replace each of those columns with its factor version.
long_iris[factors] <- lapply(
  X = long_iris[factors],
  FUN = as.factor
)
LS0tCnRpdGxlOiAiSXJpcyBEYXRhIENsZWFuaW5nIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBJbnN0YWxsIHRoZSByZXF1aXJlZCBwYWNrYWdlcwpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQoKIyBpbnN0YWxsLnBhY2thZ2VzKCd0aWR5dmVyc2UnKQojIGluc3RhbGwucGFja2FnZXMoJ2dyaWRFeHRyYScpCiMgaW5zdGFsbC5wYWNrYWdlcygnZm9yY2F0cycpCiNpbnN0YWxsLnBhY2thZ2VzKCd2aXJpZGlzJykKI2luc3RhbGwucGFja2FnZXMoJ2NsYXNzJykKI2luc3RhbGwucGFja2FnZXMoJ21vZGVscicpCiNpbnN0YWxsLnBhY2thZ2VzKCdra25uJykKI2luc3RhbGwucGFja2FnZXMoJ3Bia3J0ZXN0JykKI2luc3RhbGwucGFja2FnZXMoJ2NhcmV0JykKCmBgYAoKIyMgSW1wb3J0IHRoZSByZXF1aXJlZCBwYWNrYWdlcyAKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGdyaWQpCmxpYnJhcnkoZ3JpZEV4dHJhKQpsaWJyYXJ5KGZvcmNhdHMpCmxpYnJhcnkobW9kZWxyKQpsaWJyYXJ5KGNhcmV0KQpsaWJyYXJ5KGtrbm4pCmBgYAoKIyBEYXRhIENsZWFuaW5nCjEpIExvYWQgdGhlIGlyaXMgZGF0YSBpbnRvIGEgdGliYmxlCjIpIEdldCBhbiBvdmVydmlldyBvZiB0aGUgZGF0YQozKSBDb2VyY2UgY2hhcmFjdGVycyAoc3RyaW5ncykgaW50byBmYWN0b3JzIChjYXRlZ29yaWNhbCBkYXRhIHJlcHJlc2VudGF0aW9uKQo0KSBSZWZvcm1hdCB0aWJibGUgaW50byBhIExPTkcgZm9ybWF0IChmb3IgZWFzZSBvZiB2aXN1YWxpc2F0aW9uKQoKIyMgTG9hZCBEYXRhIGludG8gVGliYmxlICYgT3ZlcnZpZXcKYGBge3J9CmlyaXMgPC0gYXNfdGliYmxlKGlyaXMpCnN1bW1hcnkoaXJpcykKYGBgCgojIyBFeHBsb3JlIHRoZSBudW1iZXIgb2YgdmFyaWFibGVzIHBlciBjbGFzcyB2aXN1YWxseQpgYGB7cn0KZ2dwbG90KGlyaXMsIG1hcHBpbmcgPSBhZXMoKSkrCiAgZ2VvbV9iYXIobWFwcGluZyA9IGFlcyh4PVNwZWNpZXMsZmlsbD1TcGVjaWVzKSkKYGBgCgojIyBzYXBwbHkoKSB0byBjaGVjayB0aGUgY2xhc3Mgb2YgdmFyaWFibGVzCmBgYHtyfQoKc2FwcGx5KGlyaXMsY2xhc3MpCgpgYGAKCkFMU08gY291bGQgdXNlIHRoZSBtYXAoKSBmdW5jdGlvbiAoZnJvbSB0aGUgdGlkeXZlcnNlKQoKYGBge3J9Cm1hcChpcmlzLHNkKQptYXAoaXJpcyxtZWFuKQpgYGAKCiMjIGhvdyBtYW55IG5hIHZhbHVlcz8KYGBge3J9CiMgQ1JFQVRFIEEgRlVOQ1RJT04KbmFfZGF0YSA8LSBmdW5jdGlvbih4KXsKICBzdW0oaXMubmEoeCkpL2xlbmd0aCh4KSoxMDAKfQojV0hZIERPRVMgSVQgTkVFRCBUSEUgL0xFTkdUSCoxMDA/PwoKbmFfZGF0YTIgPC0gZnVuY3Rpb24oeCl7CiAgc3VtKGlzLm5hKHgpKQp9CgojbG9vcCBpdCBvdmVyIG91ciBkYXRhc2V0CmFwcGx5KGxvbmdfaXJpcywyLG5hX2RhdGEyKQoKYGBgCgojIyBSZWZvcm1hdCB0aGUgdGliYmxlIGludG8gTE9ORyBmb3Jt
YXQKYGBge3J9CmxvbmdfaXJpcyA8LSBpcmlzICU+JSAKICBnYXRoZXIoa2V5PSAncGFydCcsIHZhbHVlID0gJ3ZhbHVlJywgU2VwYWwuTGVuZ3RoLCBTZXBhbC5XaWR0aCwgUGV0YWwuTGVuZ3RoLCBQZXRhbC5XaWR0aCkgJT4lCiAgc2VwYXJhdGUocGFydCxjKCdwYXJ0JywnbWVhc3VyZScpLCBzZXAgPSAnXFwuJykKYGBgCgpnYXRoZXIoKSAtIGtleSA9IHRoZSBjb2x1bW4gaGVhZGVyIGZvciB0aGUgY29sdW1ucyAoYXMgcm93IHZhbHVlcykgLyB2YWx1ZSA9IHRoZSBuYW1lIG9mIHRoZSB2YWx1ZXMgKHRoZSBvbGQgcm93cykgLyB0aGUgY29sdW1ucyB5b3Ugd2FudCB1bmRlciB0aGUga2V5IApzZXBhcmF0ZSgpIC0gc3BsaXQgdGhlICdwYXJ0JyBjb2x1bW4gYXQgdGhlICcuJyBpbnRvIHR3byBjb2x1bW5zIC0gJ3BhcnQnIGFuZCAnbWVhc3VyZScKbm90ZTogbmVlZHMgdG8gYmUgXFwuIHRvIGVzY2FwZSB0aGUgZmlyc3QgXCBhbmQgdGhlbiBcLgoKIyMgQ29lcmNlIENoYXJhY3RlcnMgaW50byBmYWN0b3JzCmBgYHtyfQpmYWN0b3JzIDwtIGMoJ3BhcnQnLCdtZWFzdXJlJykKbG9uZ19pcmlzW2ZhY3RvcnNdIDwtIGxhcHBseShsb25nX2lyaXNbZmFjdG9yc10sYXMuZmFjdG9yKQpgYGAK