library(tidyverse)
library(grid)
library(gridExtra)
library(forcats)
library(modelr)
library(caret)
library(kknn)
# Convert iris to a tibble, then print summary statistics for every column.
iris <- iris %>% as_tibble()
iris %>% summary()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Flower
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50 Min. : 1.00
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50 1st Qu.: 38.25
Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50 Median : 75.50
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199 Mean : 75.50
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800 3rd Qu.:112.75
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500 Max. :150.00
# Bar chart with one bar per species; each bar is filled by its species.
ggplot(data = iris) +
  geom_bar(aes(x = Species, fill = Species))
# Report the class of every column of iris.
sapply(X = iris, FUN = class)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Flower
"numeric" "numeric" "numeric" "numeric" "factor" "integer"
Alternatively, you could use the map() function from purrr (part of the tidyverse).
# Standard deviation of each numeric column.
# Non-numeric columns (the factor Species) are dropped first: calling sd() on
# a factor is deprecated and is announced to become an error.
iris %>%
  keep(is.numeric) %>%
  map(sd)
Calling var(x) on a factor x is deprecated and will become an error.
Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
$Sepal.Length
[1] 0.8280661
$Sepal.Width
[1] 0.4358663
$Petal.Length
[1] 1.765298
$Petal.Width
[1] 0.7622377
$Species
[1] 0.8192319
$Flower
[1] 43.44537
# Mean of each numeric column.
# Non-numeric columns (the factor Species) are dropped first: mean() on a
# factor only yields NA together with a warning.
iris %>%
  keep(is.numeric) %>%
  map(mean)
argument is not numeric or logical: returning NA
$Sepal.Length
[1] 5.843333
$Sepal.Width
[1] 3.057333
$Petal.Length
[1] 3.758
$Petal.Width
[1] 1.199333
$Species
[1] NA
$Flower
[1] 75.5
# Percentage (0-100) of the elements of `x` that are NA.
# An empty `x` gives NaN, because the count is divided by a length of zero.
na_data <- function(x) {
  n_missing <- sum(is.na(x))
  n_total <- length(x)
  n_missing / n_total * 100
}
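# Why the / length(x) * 100? Dividing the NA count by the vector length and multiplying by 100 turns the raw count into a percentage of missing values.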
# Number of NA elements in `x`, as a raw count rather than a percentage.
na_data2 <- function(x) {
  missing_mask <- is.na(x)
  sum(missing_mask)
}
# Count the missing values in every column of long_iris.
# vapply() visits the columns directly, whereas apply() would first coerce the
# whole table to a matrix; integer(1) also enforces one integer count per
# column.
# NOTE(review): long_iris is only built further down in this file, so that
# code has to run before this line.
vapply(long_iris, na_data2, integer(1))
Species part measure value
0 0 0 0
# Reshape iris to long format: the four measurement columns are stacked into
# `part`/`value`, then the old column name is split into `part` and `measure`.
long_iris <- iris %>%
  gather(
    key = "part",
    value = "value",
    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
  ) %>%
  separate(col = part, into = c("part", "measure"), sep = "\\.")
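gather(): `key` is the name of the new column that holds the old column headers (as row values); `value` is the name of the new column that holds the old cell values; the remaining arguments are the columns to gather under the key. separate(): splits the `part` column at the '.' into two columns, `part` and `measure`. Note: the separator has to be written as "\\." - the first backslash escapes the second one inside the R string, and the resulting \. makes the regex match a literal '.'.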
# Columns of long_iris to convert to factors.
factors <- c("part", "measure")
long_iris[factors] <- map(long_iris[factors], as.factor)