pacman::p_load(lubridate,stringr,VIM)
# Load the deliberately corrupted iris data set that the rest of the script
# inspects and cleans.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Number of missing Petal.Length observations.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19

# Rows that contain no NA at all. The count and the proportion get separate
# names so the second assignment no longer overwrites the first; `complete`
# keeps its final value (the proportion of complete rows).
is_complete <- complete.cases(dirty_iris)
n_complete <- sum(is_complete)
complete <- mean(is_complete)

# Petal.Length entries that contain a letter. The pattern must be a character
# class: the bare string "a-zA-Z" only matches that literal seven-character
# text, not "any letter".
bad_data <- str_subset(dirty_iris$Petal.Length, "[a-zA-Z]")

# Comparing against the text "missing" yields NA wherever Petal.Length is NA,
# and subsetting with NA returns NA, so this prints one NA per missing entry.
dirty_iris$Petal.Length[dirty_iris$Petal.Length == "missing"]
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

# Question 5
# Per-column overview; the impossible extremes (negative or zero widths,
# lengths far above 30, Inf in Petal.Width) are what the later steps address.
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Rows that break the plausibility rules: a sepal width must be positive and a
# sepal length must not exceed 30. subset() treats an NA condition as FALSE,
# so rows with a missing measurement are not reported here.
rules_violate <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)

# Question 8
nrow(rules_violate)
## [1] 4
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] 2

# Blank out every non-positive sepal width. The threshold is `<= 0` to agree
# with the rule and the count above (a width of exactly 0 is just as
# impossible as a negative one), and which() keeps NA comparisons out of the
# subscript.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width <= 0)] <- NA

# Question 9
# Mean imputation for Sepal.Width.
sepal_width_missing <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width[sepal_width_missing] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Median imputation for Petal.Length.
petal_length_missing <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length[petal_length_missing] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Linear-regression imputation for Petal.Length.
# The result is written back to the existing `Petal.Length` column; writing to
# a misspelled name would silently add an extra all-NA column that the kNN
# step below would then treat as a distance variable.
# NOTE(review): the median step above has already filled every missing
# Petal.Length, so no rows remain for this step to impute. If regression was
# meant for a column that still has NAs (Sepal.Length does at this point),
# change the formula and the target column accordingly -- confirm the intent.
petal_length_fit <- lm(
  Petal.Length ~ Sepal.Length + Sepal.Width,
  data = dirty_iris
)
petal_length_missing <- is.na(dirty_iris$Petal.Length)
if (any(petal_length_missing)) {
  dirty_iris$Petal.Length[petal_length_missing] <- predict(
    petal_length_fit,
    newdata = dirty_iris[petal_length_missing, ]
  )
}

# Re-check the columns: Sepal.Width and Petal.Length should no longer report
# NA's, while Sepal.Length and Petal.Width still do. Re-run to regenerate the
# printed table.
summary(dirty_iris)

# kNN imputation (VIM) for Petal.Width, using the 5 nearest neighbours.
# By default kNN() also appends a logical `Petal.Width_imp` indicator column.
# NOTE(review): summary() reported Inf values in Petal.Width; kNN() only fills
# NA, so consider converting non-finite values to NA first -- confirm.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)