pacman::p_load(lubridate,stringr,VIM)
# Load the raw "dirty iris" data set directly from its GitHub location.
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Number of missing Petal.Length observations.
sum(is.na(dirty_iris$Petal.Length))
## [1] 19

# Count and proportion of fully observed rows. The count gets its own name so
# it is no longer overwritten; `complete` still ends up holding the proportion.
n_complete <- sum(complete.cases(dirty_iris))
complete <- mean(complete.cases(dirty_iris))

# Petal.Length values that contain a letter. The character class needs square
# brackets: the bare pattern "a-zA-Z" only matches that literal text.
bad_data <- str_subset(as.character(dirty_iris$Petal.Length), "[a-zA-Z]")

# Entries literally recorded as "missing". `%in%` never yields NA, so NA
# entries are not returned as if they had matched (which `==` does).
dirty_iris$Petal.Length[dirty_iris$Petal.Length %in% "missing"]
## numeric(0)

# Question 5 ----

# Per-column descriptive overview of the data frame as loaded.
summary(object = dirty_iris)
##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

#Question 7

# Rows that break either plausibility rule used here: Sepal.Width must be
# positive and Sepal.Length must not exceed 30. subset() treats an NA
# condition as FALSE, so only definite violations are kept.
rules_violate <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)

#Question 8

nrow(rules_violate)
## [1] 4
sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
## [1] 2

# Blank out every non-positive Sepal.Width, matching the `<= 0` rule applied
# above (a strict `< 0` leaves the zero-width row in place). which() skips NA
# comparisons, so rows that are already missing are left untouched.
# NOTE: summaries recorded further down that show Min. 0.000 for Sepal.Width
# were captured with the strict comparison and need to be regenerated.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width <= 0)] <- NA

#Question 9

#Mean

# Fill missing Sepal.Width with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

#Median

# Fill missing Petal.Length with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

#LR

# Regression imputation for Sepal.Length, which still has missing values here
# and is not handled by any other imputation step in this script.
# The previous version modelled Petal.Length, which the median step above had
# already completed, so no row was selected and the empty prediction was
# written to a misspelled column "Petal.Lengths" -- adding an all-NA column
# instead of imputing anything.
# Both predictors were completed above, so every missing row gets a prediction.
# The model and mask also no longer reuse the names of base `lm` and `I`.
sepal_length_fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length,
                       data = dirty_iris)

sepal_length_missing <- is.na(dirty_iris$Sepal.Length)

dirty_iris[sepal_length_missing, "Sepal.Length"] <-
  predict(sepal_length_fit, dirty_iris[sepal_length_missing, ])

summary(dirty_iris)
# NOTE: the output below was recorded before the regression step was fixed.
# With the fix, Sepal.Length has no NA's (its statistics change accordingly)
# and the Petal.Lengths column no longer exists; re-run to refresh it.
##   Sepal.Length     Sepal.Width      Petal.Length     Petal.Width 
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.700   1st Qu.:0.3  
##  Median : 5.750   Median : 3.100   Median : 4.500   Median :1.3  
##  Mean   : 6.559   Mean   : 3.439   Mean   : 4.456   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.439   3rd Qu.: 5.100   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.000   Max.   :Inf  
##  NA's   :10                                         NA's   :12   
##    Species          Petal.Lengths
##  Length:150         Min.   : NA  
##  Class :character   1st Qu.: NA  
##  Mode  :character   Median : NA  
##                     Mean   :NaN  
##                     3rd Qu.: NA  
##                     Max.   : NA  
##                     NA's   :150

# KNN ----

# Nearest-neighbour imputation restricted to Petal.Width, with 5 neighbours;
# the result replaces the working data frame.
dirty_iris <- VIM::kNN(data = dirty_iris, variable = "Petal.Width", k = 5)
##  Sepal.Length   Sepal.Width  Petal.Length Petal.Lengths  Sepal.Length 
##             0             0             0             0            73 
##   Sepal.Width  Petal.Length Petal.Lengths 
##            30            63             1

# End of script (a stray Markdown code fence here was not valid R).