Load packages and data

pacman::p_load(dplyr)
pacman::p_load(VIM)
# Read the deliberately corrupted iris data set straight from GitHub.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

Q 1,2

# Total number of NA cells across all columns (Inf values are not counted).
dirty_iris %>%
  is.na() %>%
  sum()
[1] 58

Q 3

# Per-column overview of the raw data: ranges, quartiles and NA counts.
dirty_iris %>%
  summary()
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width    Species         
 Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3   Mode  :character  
 Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf                     
 NA's   :10       NA's   :17       NA's   :19      NA's   :12                      

Q 4

# Petal.Width contains infinite entries (its mean and max print as Inf), which
# distort every statistic computed on the column.
# Show the affected rows first, then replace the infinite values with NA.
# is.infinite() matches both Inf and -Inf and returns FALSE for NA, so the
# logical mask can be used directly without which().
dirty_iris[is.infinite(dirty_iris$Petal.Width), ]

dirty_iris$Petal.Width[is.infinite(dirty_iris$Petal.Width)] <- NA

Q 5

# Collect the observations that break a physical rule: a sepal longer than
# 30 or a negative sepal width. subset() drops rows where the condition is NA.
impossible_iris <- subset(
  dirty_iris,
  subset = Sepal.Length > 30 | Sepal.Width < 0
)

Q 6

# A negative sepal width cannot be a real measurement, so mark it as missing.
# which() discards the NA comparisons and keeps only the offending positions.
# NOTE(review): widths of exactly 0 are left untouched by this rule.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)] <- NA

# Verify the change: the Sepal.Width minimum should no longer be negative.
summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.559   Mean   : 3.439   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :10       NA's   :18       NA's   :19      NA's   :13                        

Q 7 -> Linear Regression Model

# Fit a linear model that explains Sepal.Length from the other three
# measurements; it is used below to impute missing Sepal.Length values.
model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
# Look at the data once more before imputing anything.
summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.559   Mean   : 3.439   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :10       NA's   :18       NA's   :19      NA's   :13                        
# Flag the rows whose Sepal.Length is missing. The mask has a descriptive
# name instead of `I`, which masks base::I().
sepal_length_missing <- is.na(dirty_iris$Sepal.Length)

# Fill only those rows with the fitted model's predictions. A row that is also
# missing one of the predictors gets an NA prediction and therefore stays NA.
dirty_iris$Sepal.Length[sepal_length_missing] <- predict(
  model,
  newdata = dirty_iris[sepal_length_missing, ]
)
summary(dirty_iris)
  Sepal.Length     Sepal.Width      Petal.Length    Petal.Width      Species         
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300   Class :character  
 Median : 5.800   Median : 3.000   Median : 4.50   Median :1.300   Mode  :character  
 Mean   : 6.554   Mean   : 3.439   Mean   : 4.45   Mean   :1.207                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800                     
 Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500                     
 NA's   :1        NA's   :18       NA's   :19      NA's   :13                        

Q 7 -> Mean & Median

# Mean of the observed (non-missing) Sepal.Width values.
dirty_iris$Sepal.Width %>%
  mean(na.rm = TRUE)
[1] 3.393333
# Mean imputation: overwrite only the missing Sepal.Width cells with the mean
# of the observed values. The column has to be addressed explicitly: indexing
# the whole data frame with a row-length logical vector selects columns and
# fails with "undefined columns selected".
sepal_width_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- sepal_width_mean

Q 7 -> kNN

# k-nearest-neighbour imputation (VIM): each missing cell is filled from the
# most similar observations, not from the column mean. With the default
# settings numeric values are aggregated with the median of the neighbours.
dirty_iris2 <- kNN(dirty_iris)
# kNN() appends indicator columns (extra columns, not rows) that mark the
# imputed cells; keep only the original Sepal.Length..Species columns.
dirty_iris <- subset(dirty_iris2, select = Sepal.Length:Species)
summary(dirty_iris) # no NA's should remain in any column
  Sepal.Length     Sepal.Width      Petal.Length     Petal.Width      Species         
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.100   Length:150        
 1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.600   1st Qu.:0.300   Class :character  
 Median : 5.800   Median : 3.000   Median : 4.400   Median :1.300   Mode  :character  
 Mean   : 6.543   Mean   : 3.393   Mean   : 4.307   Mean   :1.210                     
 3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.100   3rd Qu.:1.875                     
 Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :2.500                     
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBJbnNlcnQgRGF0YSBhbmQgcGFja2FnZXMKYGBge3J9CnBhY21hbjo6cF9sb2FkKGRwbHlyKQpwYWNtYW46OnBfbG9hZChWSU0pCmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCmBgYAojIyBRIDEsMgpgYGB7cn0Kc3VtKGlzLm5hKGRpcnR5X2lyaXMpKQpgYGAKIyMgUSAzCmBgYHtyfQpzdW1tYXJ5KGRpcnR5X2lyaXMpCmBgYAojIyBRIDQKYGBge3J9CmZpbHRlcihkaXJ0eV9pcmlzLCBQZXRhbC5XaWR0aCA9PSBJbmYpICU+JQogIHJlcGxhY2UoNCwgTkEpCgpkaXJ0eV9pcmlzW3doaWNoKGRpcnR5X2lyaXMkUGV0YWwuV2lkdGggPT0gSW5mKSwgIlBldGFsLldpZHRoIl0gPC0gTkEKYGBgCiMjIFEgNQpgYGB7cn0KaW1wb3NzaWJsZV9pcmlzIDwtCiAgZGlydHlfaXJpcyAlPiUKICBzdWJzZXQoU2VwYWwuV2lkdGggPCAwIHwgU2VwYWwuTGVuZ3RoID4gMzApCmBgYAojIyBRIDYKYGBge3J9CiNkaXJ0eV9pcmlzICU+JQojICBmaWx0ZXIoU2VwYWwuV2lkdGggPCAwKSAlPiUKIyAgcmVwbGFjZSgyLCBOQSkKCmRpcnR5X2lyaXNbd2hpY2goZGlydHlfaXJpcyRTZXBhbC5XaWR0aCA8IDApLCAiU2VwYWwuV2lkdGgiXSA8LSBOQQoKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKCiMjIFEgNyAtPiBMaW5lYXIgUmVncmVzc2lvbiBNb2RlbApgYGB7cn0KbW9kZWwgPC0gbG0oU2VwYWwuTGVuZ3RoIH4gU2VwYWwuV2lkdGggKyBQZXRhbC5MZW5ndGggKyBQZXRhbC5XaWR0aCwKICAgICAgICAgICAgZGF0YSA9IGRpcnR5X2lyaXMpICMzOiBiYXNpYyBsaW5lYXIgbW9kZWwKc3VtbWFyeShkaXJ0eV9pcmlzKSAjMzogYWx3YXlzIGNoZWNrCgpJIDwtIGlzLm5hKGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoKSAjMzogc2F2ZSBtaXNzaW5nIHZhbHVlcwoKZGlydHlfaXJpcyRTZXBhbC5MZW5ndGhbSV0gPC0gcHJlZGljdChtb2RlbCwKICAgICAgICAgICAgICAgICAgICAgICAgIG5ld2RhdGEgPSBkaXJ0eV9pcmlzW0ksXSkgIzM6IHVzZXMgdGhlIG1vZGVsIHRvIHByZWRpY3QgdmFsdWVzIGZvciBTZXBhbC5MZW5ndGgKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKIyMgUTcgLT4gTWVhbiAmIE1lZGlhbgpgYGB7cn0KbWVhbihkaXJ0eV9pcmlzJFNlcGFsLldpZHRoLCBuYS5ybSA9IFRSVUUpCmRpcnR5X2lyaXNbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpc1shaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldKQoKbWVkaWFuKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoLCBuYS5ybSA9IFRSVUUpCmRpcnR5X2lyaXNbaXMubmEoZGlydHlfaXJpcyRQZXRhbC5XaWR0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzWyFpcy5uYShkaXJ0eV9pcmlzJFBldGFsLldpZHRoKV0pCgpzdW1tYXJ5
KGRpcnR5X2lyaXMpCmBgYAoKIyMgUSA3IC0+IGtOTgpgYGB7cn0KZGlydHlfaXJpczIgPC0ga05OKGRpcnR5X2lyaXMpICMzOiBmdW5jdGlvbiBhdXRvIGZpbGxzIGluIHcgY29sdW1uIG1lYW4KZGlydHlfaXJpcyA8LSBzdWJzZXQoZGlydHlfaXJpczIsIHNlbGVjdCA9IFNlcGFsLkxlbmd0aDpTcGVjaWVzKSAjMzogZ2VuZXJhdGVzIGV4dHJhIHJvd3MKc3VtbWFyeShkaXJ0eV9pcmlzKSAKYGBg