Insert Data and packages
# Load both packages in one call: dplyr supplies the pipe and data verbs,
# VIM supplies kNN() for the nearest-neighbour imputation step.
pacman::p_load(dplyr, VIM)

# Read the dirty_iris practice data set straight from its GitHub source.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
Q 1,2
# Total number of missing cells across every column of the data frame.
sum(is.na(dirty_iris))
[1] 58
Q 3
# Per-column overview of the raw data: ranges, quartiles and NA counts.
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3 Mode :character
Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
NA's :10 NA's :17 NA's :19 NA's :12
Q 4
# Petal.Width contains non-finite entries (its summary mean and max are Inf).
# Build one logical mask and reuse it to inspect and to recode, so the rows
# shown are exactly the rows changed. is.infinite() matches both Inf and -Inf
# and returns FALSE for NA, so already-missing rows are left untouched.
inf_rows <- is.infinite(dirty_iris$Petal.Width)

# Show the offending observations before they are overwritten.
dirty_iris[inf_rows, ]

# Recode the non-finite measurements as missing, in place.
dirty_iris$Petal.Width[inf_rows] <- NA
Q 5
# Collect the observations that break the plausibility rules: a negative
# sepal width or a sepal length above 30. which() discards comparisons that
# evaluate to NA, so rows with missing measurements are not selected.
impossible_iris <- dirty_iris[
  which(with(dirty_iris, Sepal.Width < 0 | Sepal.Length > 30)), ,
  drop = FALSE
]
Q 6
# Recode negative sepal widths as missing, directly in the stored column.
# (A filter() %>% replace() pipeline would only alter a filtered copy.)
# which() skips NA comparisons, so existing missing values are unaffected.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)] <- NA

# Confirm the negative minimum is gone and the NA count went up.
summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.559 Mean : 3.439 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :10 NA's :18 NA's :19 NA's :13
Q 7 -> Linear Regression Model
# Fit a linear regression of Sepal.Length on the other three numeric
# measurements. With R's default na.action, lm() fits on complete rows only.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
data = dirty_iris) # model used below to impute missing Sepal.Length
summary(dirty_iris) # re-check ranges and NA counts before imputing
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.559 Mean : 3.439 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :10 NA's :18 NA's :19 NA's :13
# Logical mask of the rows whose Sepal.Length is missing.
I <- is.na(dirty_iris$Sepal.Length)

# Overwrite only the masked entries with the regression predictions for those
# rows. A prediction is itself NA when one of the row's predictors is missing,
# so such rows stay missing after this step.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  I,
  predict(model, newdata = dirty_iris[I, ])
)

summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300 Class :character
Median : 5.800 Median : 3.000 Median : 4.50 Median :1.300 Mode :character
Mean : 6.554 Mean : 3.439 Mean : 4.45 Mean :1.207
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
NA's :1 NA's :18 NA's :19 NA's :13
Q 7 -> kNN
# Impute every remaining NA with VIM's k-nearest-neighbour method: each value
# is filled from the most similar rows, not from the column mean.
dirty_iris2 <- kNN(dirty_iris)

# kNN() appends extra indicator columns (not rows) after the original ones;
# keep only the original variables, Sepal.Length through Species.
dirty_iris <- dirty_iris2[, names(dirty_iris), drop = FALSE]

summary(dirty_iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :0.100 Length:150
1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.600 1st Qu.:0.300 Class :character
Median : 5.800 Median : 3.000 Median : 4.400 Median :1.300 Mode :character
Mean : 6.543 Mean : 3.393 Mean : 4.307 Mean :1.210
3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.100 3rd Qu.:1.875
Max. :73.000 Max. :30.000 Max. :63.000 Max. :2.500
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBJbnNlcnQgRGF0YSBhbmQgcGFja2FnZXMKYGBge3J9CnBhY21hbjo6cF9sb2FkKGRwbHlyKQpwYWNtYW46OnBfbG9hZChWSU0pCmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCmBgYAojIyBRIDEsMgpgYGB7cn0Kc3VtKGlzLm5hKGRpcnR5X2lyaXMpKQpgYGAKIyMgUSAzCmBgYHtyfQpzdW1tYXJ5KGRpcnR5X2lyaXMpCmBgYAojIyBRIDQKYGBge3J9CmZpbHRlcihkaXJ0eV9pcmlzLCBQZXRhbC5XaWR0aCA9PSBJbmYpICU+JQogIHJlcGxhY2UoNCwgTkEpCgpkaXJ0eV9pcmlzW3doaWNoKGRpcnR5X2lyaXMkUGV0YWwuV2lkdGggPT0gSW5mKSwgIlBldGFsLldpZHRoIl0gPC0gTkEKYGBgCiMjIFEgNQpgYGB7cn0KaW1wb3NzaWJsZV9pcmlzIDwtCiAgZGlydHlfaXJpcyAlPiUKICBzdWJzZXQoU2VwYWwuV2lkdGggPCAwIHwgU2VwYWwuTGVuZ3RoID4gMzApCmBgYAojIyBRIDYKYGBge3J9CiNkaXJ0eV9pcmlzICU+JQojICBmaWx0ZXIoU2VwYWwuV2lkdGggPCAwKSAlPiUKIyAgcmVwbGFjZSgyLCBOQSkKCmRpcnR5X2lyaXNbd2hpY2goZGlydHlfaXJpcyRTZXBhbC5XaWR0aCA8IDApLCAiU2VwYWwuV2lkdGgiXSA8LSBOQQoKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKCiMjIFEgNyAtPiBMaW5lYXIgUmVncmVzc2lvbiBNb2RlbApgYGB7cn0KbW9kZWwgPC0gbG0oU2VwYWwuTGVuZ3RoIH4gU2VwYWwuV2lkdGggKyBQZXRhbC5MZW5ndGggKyBQZXRhbC5XaWR0aCwKICAgICAgICAgICAgZGF0YSA9IGRpcnR5X2lyaXMpICMzOiBiYXNpYyBsaW5lYXIgbW9kZWwKc3VtbWFyeShkaXJ0eV9pcmlzKSAjMzogYWx3YXlzIGNoZWNrCgpJIDwtIGlzLm5hKGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoKSAjMzogc2F2ZSBtaXNzaW5nIHZhbHVlcwoKZGlydHlfaXJpcyRTZXBhbC5MZW5ndGhbSV0gPC0gcHJlZGljdChtb2RlbCwKICAgICAgICAgICAgICAgICAgICAgICAgIG5ld2RhdGEgPSBkaXJ0eV9pcmlzW0ksXSkgIzM6IHVzZXMgdGhlIG1vZGVsIHRvIHByZWRpY3QgdmFsdWVzIGZvciBTZXBhbC5MZW5ndGgKc3VtbWFyeShkaXJ0eV9pcmlzKQpgYGAKIyMgUTcgLT4gTWVhbiAmIE1lZGlhbgpgYGB7cn0KbWVhbihkaXJ0eV9pcmlzJFNlcGFsLldpZHRoLCBuYS5ybSA9IFRSVUUpCmRpcnR5X2lyaXNbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpc1shaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldKQoKbWVkaWFuKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoLCBuYS5ybSA9IFRSVUUpCmRpcnR5X2lyaXNbaXMubmEoZGlydHlfaXJpcyRQZXRhbC5XaWR0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzWyFpcy5uYShkaXJ0eV9pcmlzJFBldGFsLldpZHRoKV0pCgpzdW1tYXJ5
KGRpcnR5X2lyaXMpCmBgYAoKIyMgUSA3IC0+IGtOTgpgYGB7cn0KZGlydHlfaXJpczIgPC0ga05OKGRpcnR5X2lyaXMpICMzOiBmdW5jdGlvbiBhdXRvIGZpbGxzIGluIHcgY29sdW1uIG1lYW4KZGlydHlfaXJpcyA8LSBzdWJzZXQoZGlydHlfaXJpczIsIHNlbGVjdCA9IFNlcGFsLkxlbmd0aDpTcGVjaWVzKSAjMzogZ2VuZXJhdGVzIGV4dHJhIHJvd3MKc3VtbWFyeShkaXJ0eV9pcmlzKSAKYGBg