library(readr)
# Read the dirty iris data set directly from its URL into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Question 3: how many Petal.Length values are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
# Question 4: extract the fully observed rows in two ways and compare them.
# Variant 1: keep the rows flagged by complete.cases().
dirtyiris_completel <- dirty_iris[complete.cases(dirty_iris), , drop = FALSE]
str(object = dirtyiris_completel)
## 'data.frame': 96 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
## $ Sepal.Width : num 3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
## $ Petal.Length: num 4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
## $ Petal.Width : num 1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
## $ Species : chr "versicolor" "virginica" "setosa" "versicolor" ...
# Variant 2: na.omit() keeps the same 96 rows and additionally records the
# dropped row indices in an "na.action" attribute (see the str() output).
dirtyiris_complete2 <- na.omit(object = dirty_iris)
str(object = dirtyiris_complete2)
## 'data.frame': 96 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
## $ Sepal.Width : num 3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
## $ Petal.Length: num 4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
## $ Petal.Width : num 1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
## $ Species : chr "versicolor" "virginica" "setosa" "versicolor" ...
## - attr(*, "na.action")= 'omit' Named int [1:54] 3 6 7 13 15 17 19 20 21 22 ...
## ..- attr(*, "names")= chr [1:54] "3" "6" "7" "13" ...
# Number of missing values per column.
colSums(x = is.na(dirty_iris))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 10 17 19 12 0
# Question 5: check for special values (NA, NaN, Inf).
# NA can occur in any column, so test the whole data frame.
any(is.na(dirty_iris))
## [1] TRUE
# is.nan() and is.infinite() are applied to a matrix of the numeric columns
# only; Filter() keeps exactly the columns for which is.numeric() is TRUE.
any(is.nan(as.matrix(Filter(is.numeric, dirty_iris))))
## [1] FALSE
any(is.infinite(as.matrix(Filter(is.numeric, dirty_iris))))
## [1] TRUE
# Question 6: replace special values.
# The summary shows Inf as the mean and maximum of Petal.Width only.
summary(object = dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Turn every infinite Petal.Width into NA. is.infinite() is FALSE for NA, so
# the logical mask can be used for assignment directly.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA
# Question 7: count the rows that break either plausibility rule.
#   Rule 1: the sepal width should be a positive value.
#   Rule 2: the sepal length of an iris cannot exceed 30 cm.
# A length of exactly 30 still satisfies rule 2, so the violation test is the
# strict `> 30`. which() keeps only rows where the combined test is TRUE and
# drops rows where it evaluates to NA.
violations <- dirty_iris[which(dirty_iris$Sepal.Width <= 0 |
                                 dirty_iris$Sepal.Length > 30), ]
nrow(violations)
## [1] 4
# Question 8: locate the observations that violate the rule "Sepal.Width > 0".
# Sepal.Width contains NA, and an NA in a logical row index yields an all-NA
# row in the result. which() keeps only the positions where the comparison is
# TRUE, so only the genuine violators are listed.
dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Correct the Sepal.Width violations.
# Negative widths become their absolute value. abs() leaves non-negative
# values and NA unchanged, so it can be applied to the whole column.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# A width of exactly zero is set to NA; which() skips the existing NA entries.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA
# Check: list the rows whose Sepal.Width is now NA (or still negative).
subset(dirty_iris, is.na(Sepal.Width) | Sepal.Width < 0)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 3 6.2 NA 5.400 2.3 virginica
## 6 5.3 NA NA 0.2 setosa
## 17 5.5 NA 4.000 1.3 versicolor
## 20 5.6 NA 4.200 1.3 versicolor
## 22 5.4 NA 4.500 1.5 versicolor
## 41 5.0 NA 1.200 0.2 setosa
## 43 0.0 NA 1.300 0.4 setosa
## 47 5.7 NA NA 0.4 setosa
## 74 6.5 NA 4.600 1.5 versicolor
## 92 4.9 NA 3.300 1.0 versicolor
## 98 6.7 NA 5.000 1.7 versicolor
## 106 5.5 NA 0.925 1.0 versicolor
## 117 6.3 NA 4.400 1.3 versicolor
## 128 5.8 NA 5.100 2.4 virginica
## 130 5.7 NA 1.700 0.3 setosa
## 133 4.8 NA 1.900 0.2 setosa
## 141 5.0 NA 1.400 0.2 setosa
## 142 5.5 NA 3.800 1.1 versicolor
# Question 9: impute the remaining missing values, one method per column.
# Sepal.Width: fill each NA with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)
# Petal.Length: fill each NA with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)
# Sepal.Length: linear regression on the other three measurements, fitted on
# the rows where Sepal.Length is observed.
lm_model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  subset = !is.na(Sepal.Length)
)
# Fill the missing Sepal.Length values with the model's predictions.
# NOTE(review): Petal.Width still contains NA at this point (it is imputed by
# the kNN step further down), so a prediction may itself be NA -- confirm that
# no missing Sepal.Length survives this step.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(lm_model, newdata = subset(dirty_iris, is.na(Sepal.Length)))
)
# Petal.Width: k-nearest-neighbour imputation with the VIM package.
# install.packages("VIM")
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Impute only Petal.Width, using the 5 nearest neighbours.
# NOTE(review): with kNN()'s defaults the returned data frame presumably also
# gains a Petal.Width_imp indicator column -- confirm against the VIM docs.
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)