library(readr)
library(stringr)
# Install VIM only when it is not already available: an unconditional
# p_install() re-downloads the package on every run and needs network access.
if (!requireNamespace("VIM", quietly = TRUE)) {
  pacman::p_install(VIM)
}
##
## The downloaded binary packages are in
## /var/folders/r1/q7btql916p792lphs8rfpvy40000gn/T//Rtmpr1ZLbs/downloaded_packages
##
## VIM installed
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
Question 1
# Read the CSV from the remote URL, then count the missing Petal.Length values.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
sum(is.na(dirty_iris[["Petal.Length"]]))
## [1] 19
Question 2
# Number of rows that have no missing value in any column.
sum(complete.cases(dirty_iris))
## [1] 96
# Total number of rows.
nrow(dirty_iris)
## [1] 150
# Percentage of complete rows, computed from the data instead of retyping the
# two counts above so it cannot go stale when the data changes.
mean(complete.cases(dirty_iris)) * 100
## [1] 64
Question 3
# For each numeric column, tally NA, NaN, Inf and -Inf entries.
# Non-numeric columns produce NULL.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    Inf_values = sum(column == Inf, na.rm = TRUE),
    Neg_Inf_values = sum(column == -Inf, na.rm = TRUE)
  )
})
## $Sepal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 10 0 0 0
##
## $Sepal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 17 0 0 0
##
## $Petal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 19 0 0 0
##
## $Petal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 12 0 1 0
##
## $Species
## NULL
Question 4
# Replace every infinite value with NA. is.infinite() catches both Inf and
# -Inf, and restricting the replacement to numeric columns avoids comparing
# the character Species column against Inf.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(column) replace(column, is.infinite(column), NA)
)
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
# Re-run the special-value tally (NA, NaN, Inf, -Inf) for each numeric column
# after the replacement above. Non-numeric columns produce NULL.
sapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  c(
    NA_values = sum(is.na(column)),
    NaN_values = sum(is.nan(column)),
    Inf_values = sum(column == Inf, na.rm = TRUE),
    Neg_Inf_values = sum(column == -Inf, na.rm = TRUE)
  )
})
## $Sepal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 10 0 0 0
##
## $Sepal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 17 0 0 0
##
## $Petal.Length
## NA_values NaN_values Inf_values Neg_Inf_values
## 19 0 0 0
##
## $Petal.Width
## NA_values NaN_values Inf_values Neg_Inf_values
## 13 0 0 0
##
## $Species
## NULL
Question 5
# The comparison is `<= 0`, so zeros are counted along with negatives; the
# printed label says "non-positive" to match what is actually counted.
# NA comparisons are dropped by na.rm = TRUE.
num_negative_sepal_width <- sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE)
cat("Number of observations with non-positive Sepal.Width:", num_negative_sepal_width, "\n")
## Number of observations with non-positive Sepal.Width: 2
# Count Sepal.Length values above the threshold of 30, ignoring NAs.
num_large_sepal_length <- sum(dirty_iris$Sepal.Length > 30, na.rm = TRUE)
cat("Number of observations with Sepal.Length > 30:", num_large_sepal_length, "\n")
## Number of observations with Sepal.Length > 30: 2
Question 6
# which() keeps only the positions where the comparison is TRUE. Indexing
# with the raw logical vector would turn every NA comparison (missing
# Sepal.Width) into an all-NA row or an NA element in the result.
sepal_width_errors <- dirty_iris[which(dirty_iris$Sepal.Width <= 0), ]
sepal_width_errors
# Absolute value of the negative Sepal.Width entries only.
abs(dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width < 0)])
## [1] 3
Question 7
# Flip the sign of negative Sepal.Width values. Rows are selected by
# condition rather than by a hard-coded position, so the fix still targets
# the right rows if the data is reordered or reloaded.
negative_rows <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_rows] <- abs(dirty_iris$Sepal.Width[negative_rows])
# Zero widths are set to missing; which() keeps NAs out of the index.
zero_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_rows] <- NA
mean(dirty_iris$Sepal.Width, na.rm = TRUE)
## [1] 3.462121
# Median of the observed Petal.Length values (NAs ignored).
median(dirty_iris$Petal.Length, na.rm = TRUE)
## [1] 4.5
# Linear model of Sepal.Length on the other three numeric measurements.
# The fit is only printed here; it is not stored or used to fill in values.
# NOTE(review): rows with missing values are dropped under lm's default
# na.action -- confirm that is intended.
lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
## data = dirty_iris)
##
## Coefficients:
## (Intercept) Sepal.Width Petal.Length Petal.Width
## -0.3282 1.5127 0.1048 0.9860
# kNN() (presumably VIM's k-nearest-neighbour imputation, loaded above) with
# `variable` set to Petal.Width. The result is printed, not assigned back
# to dirty_iris.
kNN(dirty_iris, variable = "Petal.Width")