#Q1
# Load the "dirty" iris data set directly from the datacleaning repository.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
#Q2
# Number of observations whose Petal.Length is missing.
length(which(is.na(dirty_iris[["Petal.Length"]])))
## [1] 19
#Q3
# Count the rows that have no missing value in any column, then express that
# count as a percentage of all rows.
num_complete <- length(which(complete.cases(dirty_iris)))
percent_complete <- num_complete / nrow(dirty_iris) * 100
num_complete
## [1] 96
percent_complete
## [1] 64
#Q4
# Replace special values (Inf, -Inf, NaN) in every numeric column with NA.
# Testing each numeric column with is.finite() also catches -Inf and NaN,
# which a plain `== Inf` comparison misses, and it leaves non-numeric columns
# such as Species untouched instead of comparing them against Inf.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    # Entries that are already NA are non-finite too; re-assigning NA to them
    # is a no-op.
    column[!is.finite(column)] <- NA
  }
  column
})
#Q5
# Rows whose recorded Sepal.Width falls outside the accepted 0-30 range.
# which() discards comparisons that evaluate to NA, so observations with a
# missing Sepal.Width are not reported as violations.
violations <- dirty_iris[
  which(with(dirty_iris, Sepal.Width < 0 | Sepal.Width > 30)),
]
violations
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5 -3 3.5 1 versicolor
#Q6
# Correct Sepal.Width in one pass:
#   - zeros become NA (inner replace()),
#   - negative values are turned into their absolute value (outer abs()).
# NOTE(review): taking abs() presumes negative widths are sign errors --
# confirm that this is the intended correction.
dirty_iris$Sepal.Width <- abs(
  replace(
    dirty_iris$Sepal.Width,
    which(dirty_iris$Sepal.Width == 0),
    NA
  )
)
#Q7
# Impute the remaining missing values, one strategy per variable:
#   Sepal.Width  -> column mean
#   Petal.Length -> column median
#   Sepal.Length -> linear regression on Sepal.Width and Petal.Length
#   whatever is still missing -> k-nearest-neighbour imputation (VIM::kNN)
options(repos = c(CRAN = "https://cran.rstudio.com"))
# Install VIM only when it is not available yet, so that re-running the
# script does not download the package every time.
if (!requireNamespace("VIM", quietly = TRUE)) {
  install.packages("VIM")
}
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep

# Keep working on the cleaned `dirty_iris`. Overwriting it with the built-in
# `iris` data (which has no missing values) would turn every imputation step
# below into a no-op and make kNN() warn "Nothing to impute".

# Mean imputation for Sepal.Width.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Median imputation for Petal.Length.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Regression imputation for Sepal.Length. lm() drops rows with a missing
# response by default, and both predictors were filled in above, so a
# prediction is available for every row that lacks Sepal.Length.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = dirty_iris)
missing <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[missing] <-
  predict(model, newdata = dirty_iris[missing, ])

# kNN imputation (k = 5) for whatever is still missing.
iris_knn <- kNN(dirty_iris, k = 5)
# kNN() returns extra columns after the original ones; keep only the
# original columns. seq_len() stays correct even for a zero-column input.
iris_knn <- iris_knn[, seq_len(ncol(dirty_iris))]
summary(iris_knn)
head(iris_knn)