library(readr)
# Load the "dirty" iris teaching data set directly from its raw GitHub URL.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Question 3: how many Petal.Length values are missing? ----

# is.na() flags each missing entry; summing the flags counts them.
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19

# Question 4: keep only the rows without any missing value ----

# Approach 1: filter the rows with the logical mask from complete.cases().
dirtyiris_completel <- dirty_iris[complete.cases(dirty_iris), , drop = FALSE]
str(dirtyiris_completel)
## 'data.frame':    96 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
##  $ Sepal.Width : num  3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
##  $ Petal.Length: num  4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
##  $ Petal.Width : num  1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
##  $ Species     : chr  "versicolor" "virginica" "setosa" "versicolor" ...
# Approach 2: na.omit() keeps the same 96 rows and additionally records the
# dropped row indices in an "na.action" attribute (see the str() output).
dirtyiris_complete2 <- stats::na.omit(dirty_iris)
str(dirtyiris_complete2)
## 'data.frame':    96 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 5 5.7 5.9 5.8 4.8 5 6 6.8 ...
##  $ Sepal.Width : num  3.2 3.3 3.4 2.6 3 2.7 3.1 3.5 2.7 2.8 ...
##  $ Petal.Length: num  4.5 6 1.6 3.5 5.1 4.1 1.6 1.6 5.1 4.8 ...
##  $ Petal.Width : num  1.5 2.5 0.4 1 1.8 1 0.2 0.6 1.6 1.4 ...
##  $ Species     : chr  "versicolor" "virginica" "setosa" "versicolor" ...
##  - attr(*, "na.action")= 'omit' Named int [1:54] 3 6 7 13 15 17 19 20 21 22 ...
##   ..- attr(*, "names")= chr [1:54] "3" "6" "7" "13" ...
# Number of missing values in each column of the original data.
vapply(dirty_iris, function(column) sum(is.na(column)), numeric(1))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           17           19           12            0

# Question 5: checking for special values ----

# Is anything missing anywhere in the data frame? (is.na() is also TRUE
# for NaN.)
any(is.na(dirty_iris))
## [1] TRUE
# Does any numeric column contain NaN? Each column is tested on its own
# with vapply(), which always returns a logical vector; non-numeric columns
# are skipped by the is.numeric() guard.
any(vapply(
  dirty_iris,
  function(column) is.numeric(column) && any(is.nan(column)),
  logical(1)
))
## [1] FALSE
# Does any numeric column contain Inf or -Inf?
any(vapply(
  dirty_iris,
  function(column) is.numeric(column) && any(is.infinite(column)),
  logical(1)
))
## [1] TRUE

# Question 6: changing special values ----

# Column-wise overview: the Mean and Max. of Petal.Width print as Inf, so
# that column holds at least one infinite value.
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Recode every infinite Petal.Width as missing so it can be imputed later.
dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA

# Question 7: count the rows that break either of these rules ----
#   * The sepal width should be a positive value.
#   * The sepal length of an iris cannot exceed 30 cm.

# which() keeps only the positions where the combined condition is TRUE, so
# comparisons that evaluate to NA select nothing. "Cannot exceed 30" means
# a length of exactly 30 is still valid, hence the strict `>`.
violations <- dirty_iris[which(dirty_iris$Sepal.Width <= 0 |
                               dirty_iris$Sepal.Length > 30), ]

nrow(violations)
## [1] 4

# Question 8: locate the observations that violate the rule
# "Sepal.Width > 0" and repair them ----

# Indexing rows with a logical vector that contains NA returns an all-NA
# row for every NA, which buries the real offenders. which() keeps only the
# positions where the comparison is TRUE.
dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# Negative widths are replaced by their absolute value; the !is.na() guard
# keeps missing entries out of the assignment mask.
dirty_iris$Sepal.Width[!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0] <-
  abs(dirty_iris$Sepal.Width[!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0])

# A width of exactly zero is recoded as missing.
dirty_iris$Sepal.Width[!is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width == 0] <- NA

# Check the result: list the rows whose Sepal.Width is missing or still
# negative. Row 130 now shows NA, and row 16 is no longer listed.
dirty_iris[is.na(dirty_iris$Sepal.Width) | dirty_iris$Sepal.Width < 0, ]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 3            6.2          NA        5.400         2.3  virginica
## 6            5.3          NA           NA         0.2     setosa
## 17           5.5          NA        4.000         1.3 versicolor
## 20           5.6          NA        4.200         1.3 versicolor
## 22           5.4          NA        4.500         1.5 versicolor
## 41           5.0          NA        1.200         0.2     setosa
## 43           0.0          NA        1.300         0.4     setosa
## 47           5.7          NA           NA         0.4     setosa
## 74           6.5          NA        4.600         1.5 versicolor
## 92           4.9          NA        3.300         1.0 versicolor
## 98           6.7          NA        5.000         1.7 versicolor
## 106          5.5          NA        0.925         1.0 versicolor
## 117          6.3          NA        4.400         1.3 versicolor
## 128          5.8          NA        5.100         2.4  virginica
## 130          5.7          NA        1.700         0.3     setosa
## 133          4.8          NA        1.900         0.2     setosa
## 141          5.0          NA        1.400         0.2     setosa
## 142          5.5          NA        3.800         1.1 versicolor

# Question 9: impute the missing measurements ----

# Mean imputation: each missing Sepal.Width gets the mean of the observed
# Sepal.Width values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Median imputation: each missing Petal.Length gets the median of the
# observed Petal.Length values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Linear-regression imputation: fit Sepal.Length on the other three
# measurements, using only the rows where Sepal.Length is observed.
lm_model <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris,
  subset = !is.na(Sepal.Length)
)

# Fill each missing Sepal.Length with the model's prediction for that row.
# NOTE(review): if Petal.Width still contains NA at this point, a row that
# is missing both Sepal.Length and Petal.Width would presumably get an NA
# prediction and stay missing -- confirm against the data.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(lm_model, newdata = dirty_iris[is.na(dirty_iris$Sepal.Length), ])
)

# k-nearest-neighbour imputation of Petal.Width ----

# install.packages("VIM")  # one-time setup when VIM is not installed yet
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Impute only Petal.Width, using the 5 nearest neighbours, and overwrite
# dirty_iris with the result.
# NOTE(review): with VIM's defaults the returned data frame presumably also
# carries a logical Petal.Width_imp indicator column -- confirm.
dirty_iris <- kNN(data = dirty_iris, variable = "Petal.Width", k = 5)