Question 1
# Load the dirty_iris CSV directly from its GitHub location.
dirty_iris <- read.csv(
  "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Count the NA entries in Petal.Length by collecting their positions.
missing_values <- length(which(is.na(dirty_iris$Petal.Length)))
print(
  paste("Number of missing values in Petal.Length:", missing_values)
)
## [1] "Number of missing values in Petal.Length: 19"
Question 2
# A row counts as complete when none of its columns is NA.
row_is_complete <- complete.cases(dirty_iris)
complete_obs <- length(which(row_is_complete))
total_obs <- nrow(dirty_iris)
# Share of complete rows, expressed as a percentage.
percentage_complete <- complete_obs / total_obs * 100
print(paste("Number of complete observations:", complete_obs))
## [1] "Number of complete observations: 96"
# Round only for display; percentage_complete keeps full precision.
percentage_label <- round(percentage_complete, 2)
print(paste("Percentage of complete observations:", percentage_label, "%"))
## [1] "Percentage of complete observations: 64 %"
Question 3
# Named logical mask: TRUE for every numeric column of dirty_iris.
numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))
# Work on the numeric part only; the special-value checks below are per column.
numeric_data <- dirty_iris[, numeric_columns]
n_rows <- nrow(numeric_data)
# Per-column count of NA entries (is.na() is also TRUE for NaN).
missing_values <- colSums(is.na(numeric_data))
# Per-column count of Inf / -Inf entries.
infinite_values <- colSums(vapply(numeric_data, is.infinite, logical(n_rows)))
# Per-column count of NaN entries.
nan_values <- colSums(vapply(numeric_data, is.nan, logical(n_rows)))
# Report the three tallies, each under its own heading.
print("Missing values (NA):")
## [1] "Missing values (NA):"
print(missing_values)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 10 17 19 12
print("Infinite values (Inf/-Inf):")
## [1] "Infinite values (Inf/-Inf):"
print(infinite_values)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 1
print("NaN values:")
## [1] "NaN values:"
print(nan_values)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0 0 0 0
print("Unique values in numeric columns:")
## [1] "Unique values in numeric columns:"
# Distinct values per numeric column, auto-printed for eyeballing outliers.
sapply(numeric_data, unique)
## $Sepal.Length
## [1] 6.4 6.3 6.2 5.0 5.7 5.3 5.9 5.8 4.8 6.0 6.8 NA 5.5 4.7 5.6
## [16] 4.9 5.4 6.7 4.6 73.0 6.5 4.4 6.6 0.0 7.7 7.0 5.2 5.1 7.4 4.3
## [31] 7.2 6.1 6.9 7.6 7.9 4.5 49.0
##
## $Sepal.Width
## [1] 3.2 3.3 NA 3.4 2.6 2.7 3.0 3.1 3.5 2.8 3.9 -3.0 4.0 3.6 29.0
## [16] 2.9 2.2 2.5 4.2 3.8 2.3 3.7 4.1 30.0 0.0
##
## $Petal.Length
## [1] 4.500 6.000 5.400 1.600 3.500 NA 5.300 5.100 4.100 4.800
## [11] 1.700 4.000 1.300 4.200 5.700 5.900 1.400 1.500 63.000 0.820
## [21] 23.000 5.500 5.800 1.200 3.900 6.700 4.700 4.400 5.600 3.300
## [31] 6.100 1.100 4.900 4.600 6.600 0.000 6.400 5.200 1.900 4.300
## [41] 5.000 0.925 6.900 14.000 3.600 3.800
##
## $Petal.Width
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 NA 1.8 0.6 1.6 1.4 1.3 0.1 2.1 2.0 1.2 1.9 2.2 0.3
## [20] 1.1 Inf 1.7 2.4 0.5
Question 4
# Purpose: find every column that holds an infinite value and replace those
# values with NA so later steps treat them as missing.
# Read the dataset
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
# Flag the columns that contain Inf or -Inf. Only numeric columns can hold
# infinite values, so non-numeric columns (e.g. Species) are always FALSE.
inf_locations <- vapply(
  dirty_iris,
  function(x) is.numeric(x) && any(is.infinite(x)),
  logical(1)
)
# Print which columns contain Inf values
print("Columns containing Inf values:")
## [1] "Columns containing Inf values:"
print(names(dirty_iris)[inf_locations])
## [1] "Petal.Width"
# Replace both Inf and -Inf with NA. is.infinite() catches either sign, whereas
# a comparison with `== Inf` would leave -Inf untouched. Looping over the
# flagged columns is a no-op when no column contains infinite values.
for (col_name in names(dirty_iris)[inf_locations]) {
  dirty_iris[[col_name]][is.infinite(dirty_iris[[col_name]])] <- NA
}
# Verify replacement (checking Petal.Width column specifically)
print("Updated Petal.Width column:")
## [1] "Updated Petal.Width column:"
print(dirty_iris$Petal.Width)
## [1] 1.5 2.5 2.3 0.4 1.0 0.2 NA 1.8 1.0 0.2 0.6 1.6 NA 1.4 0.4 1.0 1.3 0.2
## [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1 NA 2.0 1.3 0.2 NA 1.3 1.3 1.3 0.1
## [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
## [55] 1.6 2.2 0.3 1.5 NA 2.1 NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3 NA 0.3 1.4 2.3
## [73] 0.3 1.5 2.0 NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3 NA 1.3 1.3 0.2 1.9
## [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5 NA 1.3 0.4 1.0 2.0 NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5 NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8 NA
Question 5
# Rules checked: Sepal.Width must be > 0 and Sepal.Length must be <= 30.
# A row violates the rules when at least one of the two conditions fails.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
# rule_broken is NA wherever a missing measurement makes the check undecidable.
# Indexing a data frame with NA in a logical mask returns an all-NA phantom row
# for each NA, which inflates nrow(). which() keeps only the rows where the
# mask is TRUE, so only demonstrable violations are listed and counted.
violations <- dirty_iris[which(rule_broken), ]
print(violations)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
num_violations <- nrow(violations)
cat("Number of observations violating the rules:", num_violations, "\n")
## Number of observations violating the rules: 4
Question 6
# Snapshot of the column before correction; which() ignores NA comparisons,
# so rows with a missing Sepal.Width are never selected.
sepal_width <- dirty_iris$Sepal.Width
invalid_sepal_width <- which(sepal_width <= 0)
# Show the offending rows before the fix.
print(dirty_iris[invalid_sepal_width, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 130 5.7 0 1.7 0.3 setosa
# Negative widths are replaced by their absolute value.
negative_rows <- which(sepal_width < 0)
dirty_iris$Sepal.Width[negative_rows] <- abs(sepal_width[negative_rows])
# Widths of exactly zero are set to NA.
zero_rows <- which(sepal_width == 0)
dirty_iris$Sepal.Width[zero_rows] <- NA
# Show the same rows after the fix.
print(dirty_iris[invalid_sepal_width, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 3 3.5 1.0 versicolor
## 130 5.7 NA 1.7 0.3 setosa
cat("Error correction completed. Sepal.Width rule violations have been fixed.\n")
## Error correction completed. Sepal.Width rule violations have been fixed.
Question 7
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Sepal.Width: fill missing entries with the mean of the observed values.
sepal_width_missing <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  sepal_width_missing,
  mean(dirty_iris$Sepal.Width[!sepal_width_missing])
)
# Petal.Length: fill missing entries with the median of the observed values.
petal_length_missing <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  petal_length_missing,
  median(dirty_iris$Petal.Length[!petal_length_missing])
)
# Sepal.Length: fit a linear model on the other three measurements, then
# predict for exactly the rows whose Sepal.Length is missing.
lm_model <- lm(
  Sepal.Length ~ Petal.Length + Petal.Width + Sepal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
sepal_length_missing <- is.na(dirty_iris$Sepal.Length)
rows_to_predict <- dirty_iris[sepal_length_missing, ]
predicted_values <- predict(lm_model, newdata = rows_to_predict)
dirty_iris$Sepal.Length[sepal_length_missing] <- predicted_values
# Petal.Width: k-nearest-neighbour imputation (k = 5) via VIM::kNN, which
# appends a Petal.Width_imp column to the returned data frame.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)
print(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.400000 3.200000 4.500 1.5 versicolor
## 2 6.300000 3.300000 6.000 2.5 virginica
## 3 6.200000 3.462121 5.400 2.3 virginica
## 4 5.000000 3.400000 1.600 0.4 setosa
## 5 5.700000 2.600000 3.500 1.0 versicolor
## 6 5.300000 3.462121 4.500 0.2 setosa
## 7 6.400000 2.700000 5.300 1.9 virginica
## 8 5.900000 3.000000 5.100 1.8 virginica
## 9 5.800000 2.700000 4.100 1.0 versicolor
## 10 4.800000 3.100000 1.600 0.2 setosa
## 11 5.000000 3.500000 1.600 0.6 setosa
## 12 6.000000 2.700000 5.100 1.6 versicolor
## 13 6.000000 3.000000 4.800 1.8 virginica
## 14 6.800000 2.800000 4.800 1.4 versicolor
## 15 5.925959 3.900000 1.700 0.4 setosa
## 16 5.000000 3.000000 3.500 1.0 versicolor
## 17 5.500000 3.462121 4.000 1.3 versicolor
## 18 4.700000 3.200000 1.300 0.2 setosa
## 19 6.223562 4.000000 4.500 0.2 setosa
## 20 5.600000 3.462121 4.200 1.3 versicolor
## 21 4.900000 3.600000 4.500 0.1 setosa
## 22 5.400000 3.462121 4.500 1.5 versicolor
## 23 6.200000 2.800000 4.500 1.8 virginica
## 24 6.700000 3.300000 5.700 2.5 virginica
## 25 6.761731 3.000000 5.900 2.1 virginica
## 26 4.600000 3.200000 1.400 0.2 setosa
## 27 4.900000 3.100000 1.500 0.1 setosa
## 28 73.000000 29.000000 63.000 2.0 virginica
## 29 6.500000 3.200000 5.100 2.0 virginica
## 30 5.050877 2.800000 0.820 1.3 versicolor
## 31 4.400000 3.200000 4.500 0.2 setosa
## 32 5.900000 3.200000 4.800 1.5 versicolor
## 33 5.700000 2.800000 4.500 1.3 versicolor
## 34 6.200000 2.900000 4.500 1.3 versicolor
## 35 6.600000 2.900000 23.000 1.3 versicolor
## 36 4.800000 3.000000 1.400 0.1 setosa
## 37 6.500000 3.000000 5.500 1.8 virginica
## 38 6.200000 2.200000 4.500 1.5 versicolor
## 39 6.700000 2.500000 5.800 1.8 virginica
## 40 5.000000 3.000000 1.600 0.2 setosa
## 41 5.000000 3.462121 1.200 0.2 setosa
## 42 5.800000 2.700000 3.900 1.2 versicolor
## 43 0.000000 3.462121 1.300 0.4 setosa
## 44 5.800000 2.700000 5.100 1.9 virginica
## 45 5.500000 4.200000 1.400 0.2 setosa
## 46 7.700000 2.800000 6.700 2.0 virginica
## 47 5.700000 3.462121 4.500 0.4 setosa
## 48 7.000000 3.200000 4.700 1.4 versicolor
## 49 6.500000 3.000000 5.800 2.2 virginica
## 50 6.000000 3.400000 4.500 1.6 versicolor
## 51 5.500000 2.600000 4.400 1.2 versicolor
## 52 4.900000 3.100000 4.500 0.2 setosa
## 53 5.200000 2.700000 3.900 1.4 versicolor
## 54 4.800000 3.400000 1.600 0.2 setosa
## 55 6.300000 3.300000 4.700 1.6 versicolor
## 56 7.700000 3.800000 6.700 2.2 virginica
## 57 5.100000 3.800000 1.500 0.3 setosa
## 58 5.849790 2.900000 4.500 1.5 versicolor
## 59 6.400000 2.800000 5.600 1.8 virginica
## 60 6.400000 2.800000 5.600 2.1 virginica
## 61 5.000000 2.300000 3.300 1.1 versicolor
## 62 7.400000 2.800000 6.100 1.9 virginica
## 63 4.300000 3.000000 1.100 0.1 setosa
## 64 5.000000 3.300000 1.400 0.2 setosa
## 65 7.200000 3.000000 5.800 1.6 virginica
## 66 6.300000 2.500000 4.900 1.5 versicolor
## 67 5.100000 2.500000 4.500 1.1 versicolor
## 68 7.234040 3.200000 5.700 2.3 virginica
## 69 5.100000 3.500000 4.500 0.2 setosa
## 70 5.000000 3.500000 1.300 0.3 setosa
## 71 6.100000 3.000000 4.600 1.4 versicolor
## 72 6.900000 3.100000 5.100 2.3 virginica
## 73 5.100000 3.500000 1.400 0.3 setosa
## 74 6.500000 3.462121 4.600 1.5 versicolor
## 75 5.600000 2.800000 4.900 2.0 virginica
## 76 4.900000 2.500000 4.500 1.9 virginica
## 77 5.500000 3.500000 1.300 0.2 setosa
## 78 7.600000 3.000000 6.600 2.1 virginica
## 79 5.100000 3.800000 0.000 0.2 setosa
## 80 7.900000 3.800000 6.400 2.0 virginica
## 81 6.100000 2.600000 5.600 1.4 virginica
## 82 5.400000 3.400000 1.700 0.2 setosa
## 83 6.100000 2.900000 4.700 1.4 versicolor
## 84 5.400000 3.700000 1.500 0.2 setosa
## 85 6.700000 3.000000 5.200 2.3 virginica
## 86 5.100000 3.800000 1.900 0.3 setosa
## 87 6.400000 2.900000 4.300 1.3 versicolor
## 88 5.700000 2.900000 4.200 1.3 versicolor
## 89 4.400000 2.900000 1.400 0.2 setosa
## 90 6.300000 2.500000 5.000 1.9 virginica
## 91 7.200000 3.200000 6.000 1.8 virginica
## 92 4.900000 3.462121 3.300 1.0 versicolor
## 93 5.200000 3.400000 1.400 0.2 setosa
## 94 5.800000 2.700000 5.100 1.9 virginica
## 95 6.000000 2.200000 5.000 1.5 virginica
## 96 6.900000 3.100000 4.500 1.5 versicolor
## 97 5.500000 2.300000 4.000 1.3 versicolor
## 98 6.700000 3.462121 5.000 1.7 versicolor
## 99 5.700000 3.000000 4.200 1.2 versicolor
## 100 6.300000 2.800000 5.100 1.5 virginica
## 101 5.400000 3.400000 1.500 0.4 setosa
## 102 7.200000 3.600000 4.500 2.5 virginica
## 103 6.300000 2.700000 4.900 1.9 virginica
## 104 5.600000 3.000000 4.100 1.3 versicolor
## 105 5.100000 3.700000 4.500 0.4 setosa
## 106 5.500000 3.462121 0.925 1.0 versicolor
## 107 6.500000 3.000000 5.200 2.0 virginica
## 108 4.800000 3.000000 1.400 0.2 setosa
## 109 6.100000 2.800000 4.500 1.3 versicolor
## 110 4.600000 3.400000 1.400 0.3 setosa
## 111 6.300000 3.400000 4.500 2.4 virginica
## 112 5.000000 3.400000 1.500 0.2 setosa
## 113 5.100000 3.400000 1.500 0.2 setosa
## 114 7.187596 3.300000 5.700 2.1 virginica
## 115 6.700000 3.100000 4.700 1.5 versicolor
## 116 7.700000 2.600000 6.900 2.3 virginica
## 117 6.300000 3.462121 4.400 1.3 versicolor
## 118 4.600000 3.100000 1.500 0.2 setosa
## 119 6.712582 3.000000 5.500 2.1 virginica
## 120 5.429333 2.800000 4.700 1.2 versicolor
## 121 5.900000 3.000000 4.500 1.5 versicolor
## 122 4.500000 2.300000 1.300 0.3 setosa
## 123 6.400000 3.200000 5.300 2.3 virginica
## 124 5.200000 4.100000 1.500 0.1 setosa
## 125 49.000000 30.000000 14.000 2.0 setosa
## 126 5.600000 2.900000 3.600 1.3 versicolor
## 127 6.800000 3.200000 5.900 2.3 virginica
## 128 5.800000 3.462121 5.100 2.4 virginica
## 129 4.600000 3.600000 4.500 0.2 setosa
## 130 5.700000 3.462121 1.700 0.3 setosa
## 131 5.600000 2.500000 3.900 1.1 versicolor
## 132 6.700000 3.100000 4.400 1.4 versicolor
## 133 4.800000 3.462121 1.900 0.2 setosa
## 134 5.100000 3.300000 1.700 0.5 setosa
## 135 4.400000 3.000000 1.300 0.2 setosa
## 136 7.700000 3.000000 4.500 2.3 virginica
## 137 4.700000 3.200000 1.600 0.2 setosa
## 138 6.343972 3.000000 4.900 1.8 virginica
## 139 6.900000 3.100000 5.400 2.1 virginica
## 140 6.000000 2.200000 4.000 1.0 versicolor
## 141 5.000000 3.462121 1.400 0.2 setosa
## 142 5.500000 3.462121 3.800 1.1 versicolor
## 143 6.600000 3.000000 4.400 1.4 versicolor
## 144 6.300000 2.900000 5.600 1.8 virginica
## 145 5.700000 2.500000 5.000 2.0 virginica
## 146 6.700000 3.100000 5.600 2.4 virginica
## 147 5.600000 3.000000 4.500 1.5 versicolor
## 148 5.200000 3.500000 1.500 0.2 setosa
## 149 6.400000 3.100000 4.500 1.8 virginica
## 150 5.800000 2.600000 4.000 1.1 versicolor
## Petal.Width_imp
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 TRUE
## 8 FALSE
## 9 FALSE
## 10 FALSE
## 11 FALSE
## 12 FALSE
## 13 TRUE
## 14 FALSE
## 15 FALSE
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
## 21 FALSE
## 22 FALSE
## 23 FALSE
## 24 FALSE
## 25 FALSE
## 26 FALSE
## 27 FALSE
## 28 TRUE
## 29 FALSE
## 30 FALSE
## 31 FALSE
## 32 TRUE
## 33 FALSE
## 34 FALSE
## 35 FALSE
## 36 FALSE
## 37 FALSE
## 38 FALSE
## 39 FALSE
## 40 FALSE
## 41 FALSE
## 42 FALSE
## 43 FALSE
## 44 FALSE
## 45 FALSE
## 46 FALSE
## 47 FALSE
## 48 FALSE
## 49 FALSE
## 50 FALSE
## 51 FALSE
## 52 FALSE
## 53 FALSE
## 54 FALSE
## 55 FALSE
## 56 FALSE
## 57 FALSE
## 58 FALSE
## 59 TRUE
## 60 FALSE
## 61 TRUE
## 62 FALSE
## 63 FALSE
## 64 FALSE
## 65 FALSE
## 66 FALSE
## 67 FALSE
## 68 FALSE
## 69 TRUE
## 70 FALSE
## 71 FALSE
## 72 FALSE
## 73 FALSE
## 74 FALSE
## 75 FALSE
## 76 TRUE
## 77 FALSE
## 78 FALSE
## 79 FALSE
## 80 FALSE
## 81 FALSE
## 82 FALSE
## 83 FALSE
## 84 FALSE
## 85 FALSE
## 86 TRUE
## 87 FALSE
## 88 FALSE
## 89 FALSE
## 90 FALSE
## 91 FALSE
## 92 FALSE
## 93 FALSE
## 94 FALSE
## 95 FALSE
## 96 FALSE
## 97 FALSE
## 98 FALSE
## 99 FALSE
## 100 FALSE
## 101 FALSE
## 102 FALSE
## 103 TRUE
## 104 FALSE
## 105 FALSE
## 106 FALSE
## 107 FALSE
## 108 TRUE
## 109 FALSE
## 110 FALSE
## 111 FALSE
## 112 FALSE
## 113 FALSE
## 114 FALSE
## 115 FALSE
## 116 FALSE
## 117 FALSE
## 118 FALSE
## 119 FALSE
## 120 FALSE
## 121 FALSE
## 122 FALSE
## 123 FALSE
## 124 FALSE
## 125 FALSE
## 126 FALSE
## 127 FALSE
## 128 FALSE
## 129 FALSE
## 130 FALSE
## 131 FALSE
## 132 FALSE
## 133 FALSE
## 134 FALSE
## 135 TRUE
## 136 FALSE
## 137 FALSE
## 138 FALSE
## 139 FALSE
## 140 FALSE
## 141 FALSE
## 142 FALSE
## 143 FALSE
## 144 FALSE
## 145 FALSE
## 146 FALSE
## 147 FALSE
## 148 FALSE
## 149 FALSE
## 150 TRUE
# Final status line for the imputation step.
cat("Imputation completed successfully!\n", sep = "")
## Imputation completed successfully!