Question 1

# Load the dirty_iris CSV from its GitHub URL into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# Each NA in Petal.Length contributes 1 to the sum of the logical mask.
missing_values <- sum(is.na(dirty_iris[["Petal.Length"]]))

# Report the count as a single labelled string.
print(
  paste("Number of missing values in Petal.Length:", missing_values)
)
## [1] "Number of missing values in Petal.Length: 19"

Question 2

# One logical flag per row: TRUE when the row has no NA in any column.
row_is_complete <- complete.cases(dirty_iris)

# Counts of complete rows and of all rows, and the share they represent.
complete_obs <- sum(row_is_complete)
total_obs <- length(row_is_complete)
percentage_complete <- (complete_obs / total_obs) * 100

print(paste("Number of complete observations:", complete_obs))
## [1] "Number of complete observations: 96"
print(
  paste(
    "Percentage of complete observations:",
    round(percentage_complete, 2),
    "%"
  )
)
## [1] "Percentage of complete observations: 64 %"

Question 3

# Select numeric columns; vapply() always returns a logical vector,
# whereas sapply() changes its return type with the input.
numeric_columns <- vapply(dirty_iris, is.numeric, logical(1))

# drop = FALSE keeps a data frame even when only one column is numeric,
# so the column-wise summaries below never receive a bare vector.
numeric_data <- dirty_iris[, numeric_columns, drop = FALSE]

# Count missing values (NA); is.na() is also TRUE for NaN entries
missing_values <- colSums(is.na(numeric_data))

# Count Inf and -Inf values, one total per column
infinite_values <- vapply(
  numeric_data,
  function(column) sum(is.infinite(column)),
  numeric(1)
)

# Count NaN values, one total per column
nan_values <- vapply(
  numeric_data,
  function(column) sum(is.nan(column)),
  numeric(1)
)

# Display results
print("Missing values (NA):")
## [1] "Missing values (NA):"
print(missing_values)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##           10           17           19           12
print("Infinite values (Inf/-Inf):")
## [1] "Infinite values (Inf/-Inf):"
print(infinite_values)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            1
print("NaN values:")
## [1] "NaN values:"
print(nan_values)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            0            0            0            0
print("Unique values in numeric columns:")
## [1] "Unique values in numeric columns:"
# lapply() always yields a named list, even if every column happened to
# have the same number of distinct values.
lapply(numeric_data, unique)
## $Sepal.Length
##  [1]  6.4  6.3  6.2  5.0  5.7  5.3  5.9  5.8  4.8  6.0  6.8   NA  5.5  4.7  5.6
## [16]  4.9  5.4  6.7  4.6 73.0  6.5  4.4  6.6  0.0  7.7  7.0  5.2  5.1  7.4  4.3
## [31]  7.2  6.1  6.9  7.6  7.9  4.5 49.0
## 
## $Sepal.Width
##  [1]  3.2  3.3   NA  3.4  2.6  2.7  3.0  3.1  3.5  2.8  3.9 -3.0  4.0  3.6 29.0
## [16]  2.9  2.2  2.5  4.2  3.8  2.3  3.7  4.1 30.0  0.0
## 
## $Petal.Length
##  [1]  4.500  6.000  5.400  1.600  3.500     NA  5.300  5.100  4.100  4.800
## [11]  1.700  4.000  1.300  4.200  5.700  5.900  1.400  1.500 63.000  0.820
## [21] 23.000  5.500  5.800  1.200  3.900  6.700  4.700  4.400  5.600  3.300
## [31]  6.100  1.100  4.900  4.600  6.600  0.000  6.400  5.200  1.900  4.300
## [41]  5.000  0.925  6.900 14.000  3.600  3.800
## 
## $Petal.Width
##  [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 0.6 1.6 1.4 1.3 0.1 2.1 2.0 1.2 1.9 2.2 0.3
## [20] 1.1 Inf 1.7 2.4 0.5

Question 4

# Read the dataset
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

# Flag the columns holding at least one infinite value (Inf or -Inf).
# vapply() guarantees exactly one logical per column.
inf_locations <- vapply(
  dirty_iris,
  function(x) any(is.infinite(x)),
  logical(1)
)

# Print which columns contain Inf values
print("Columns containing Inf values:")
## [1] "Columns containing Inf values:"
print(names(dirty_iris)[inf_locations])
## [1] "Petal.Width"
# Replace infinite values with NA, one numeric column at a time.
# is.infinite() catches -Inf as well as Inf (a plain `== Inf` test misses
# -Inf), and restricting the work to numeric columns avoids comparing the
# Species text against Inf.
numeric_cols <- vapply(dirty_iris, is.numeric, logical(1))
dirty_iris[numeric_cols] <- lapply(
  dirty_iris[numeric_cols],
  function(column) replace(column, is.infinite(column), NA)
)

# Verify replacement (checking Petal.Width column specifically)
print("Updated Petal.Width column:")
## [1] "Updated Petal.Width column:"
print(dirty_iris$Petal.Width)
##   [1] 1.5 2.5 2.3 0.4 1.0 0.2  NA 1.8 1.0 0.2 0.6 1.6  NA 1.4 0.4 1.0 1.3 0.2
##  [19] 0.2 1.3 0.1 1.5 1.8 2.5 2.1 0.2 0.1  NA 2.0 1.3 0.2  NA 1.3 1.3 1.3 0.1
##  [37] 1.8 1.5 1.8 0.2 0.2 1.2 0.4 1.9 0.2 2.0 0.4 1.4 2.2 1.6 1.2 0.2 1.4 0.2
##  [55] 1.6 2.2 0.3 1.5  NA 2.1  NA 1.9 0.1 0.2 1.6 1.5 1.1 2.3  NA 0.3 1.4 2.3
##  [73] 0.3 1.5 2.0  NA 0.2 2.1 0.2 2.0 1.4 0.2 1.4 0.2 2.3  NA 1.3 1.3 0.2 1.9
##  [91] 1.8 1.0 0.2 1.9 1.5 1.5 1.3 1.7 1.2 1.5 0.4 2.5  NA 1.3 0.4 1.0 2.0  NA
## [109] 1.3 0.3 2.4 0.2 0.2 2.1 1.5 2.3 1.3 0.2 2.1 1.2 1.5 0.3 2.3 0.1 2.0 1.3
## [127] 2.3 2.4 0.2 0.3 1.1 1.4 0.2 0.5  NA 2.3 0.2 1.8 2.1 1.0 0.2 1.1 1.4 1.8
## [145] 2.0 2.4 1.5 0.2 1.8  NA

Question 5

# A row violates the rules when Sepal.Width is not strictly positive or
# Sepal.Length exceeds 30. The comparison is NA wherever the inputs are NA.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30

# which() keeps only the positions where the test is TRUE. Indexing with
# the raw logical vector would insert an all-NA row for every NA result
# and inflate the violation count.
violations <- dirty_iris[which(rule_broken), ]

print(violations)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa
num_violations <- nrow(violations)
cat("Number of observations violating the rules:", num_violations, "\n")
## Number of observations violating the rules: 4

Question 6

# Positions where Sepal.Width is recorded; reused by every mask below so
# that NA entries never take part in a comparison.
width_known <- !is.na(dirty_iris$Sepal.Width)

# Row numbers whose recorded Sepal.Width is zero or negative.
invalid_sepal_width <- which(width_known & dirty_iris$Sepal.Width <= 0)

print(dirty_iris[invalid_sepal_width, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa
# Negative widths are replaced by their absolute value.
negative_width <- width_known & dirty_iris$Sepal.Width < 0
dirty_iris$Sepal.Width[negative_width] <-
  abs(dirty_iris$Sepal.Width[negative_width])

# Widths of exactly zero are turned into missing values.
zero_width <- width_known & dirty_iris$Sepal.Width == 0
dirty_iris$Sepal.Width[zero_width] <- NA

# Show the same rows again to confirm the corrections.
print(dirty_iris[invalid_sepal_width, ])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa
cat("Error correction completed. Sepal.Width rule violations have been fixed.\n")
## Error correction completed. Sepal.Width rule violations have been fixed.

Question 7

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Sepal.Width: fill the gaps with the mean of the observed values.
sepal_width_gap <- is.na(dirty_iris$Sepal.Width)
dirty_iris$Sepal.Width[sepal_width_gap] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: fill the gaps with the median of the observed values.
petal_length_gap <- is.na(dirty_iris$Petal.Length)
dirty_iris$Petal.Length[petal_length_gap] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Sepal.Length: regress on the other three measurements, then predict the
# rows where Sepal.Length is missing and write the predictions back.
sepal_length_gap <- is.na(dirty_iris$Sepal.Length)
lm_model <- lm(
  Sepal.Length ~ Petal.Length + Petal.Width + Sepal.Width,
  data = dirty_iris,
  na.action = na.exclude
)
predicted_values <- predict(
  lm_model,
  newdata = dirty_iris[sepal_length_gap, ]
)
dirty_iris$Sepal.Length[sepal_length_gap] <- predicted_values

# Petal.Width: k-nearest-neighbour imputation with k = 5. The returned data
# frame also carries a Petal.Width_imp column marking the imputed rows.
dirty_iris <- kNN(dirty_iris, variable = "Petal.Width", k = 5)

print(dirty_iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1       6.400000    3.200000        4.500         1.5 versicolor
## 2       6.300000    3.300000        6.000         2.5  virginica
## 3       6.200000    3.462121        5.400         2.3  virginica
## 4       5.000000    3.400000        1.600         0.4     setosa
## 5       5.700000    2.600000        3.500         1.0 versicolor
## 6       5.300000    3.462121        4.500         0.2     setosa
## 7       6.400000    2.700000        5.300         1.9  virginica
## 8       5.900000    3.000000        5.100         1.8  virginica
## 9       5.800000    2.700000        4.100         1.0 versicolor
## 10      4.800000    3.100000        1.600         0.2     setosa
## 11      5.000000    3.500000        1.600         0.6     setosa
## 12      6.000000    2.700000        5.100         1.6 versicolor
## 13      6.000000    3.000000        4.800         1.8  virginica
## 14      6.800000    2.800000        4.800         1.4 versicolor
## 15      5.925959    3.900000        1.700         0.4     setosa
## 16      5.000000    3.000000        3.500         1.0 versicolor
## 17      5.500000    3.462121        4.000         1.3 versicolor
## 18      4.700000    3.200000        1.300         0.2     setosa
## 19      6.223562    4.000000        4.500         0.2     setosa
## 20      5.600000    3.462121        4.200         1.3 versicolor
## 21      4.900000    3.600000        4.500         0.1     setosa
## 22      5.400000    3.462121        4.500         1.5 versicolor
## 23      6.200000    2.800000        4.500         1.8  virginica
## 24      6.700000    3.300000        5.700         2.5  virginica
## 25      6.761731    3.000000        5.900         2.1  virginica
## 26      4.600000    3.200000        1.400         0.2     setosa
## 27      4.900000    3.100000        1.500         0.1     setosa
## 28     73.000000   29.000000       63.000         2.0  virginica
## 29      6.500000    3.200000        5.100         2.0  virginica
## 30      5.050877    2.800000        0.820         1.3 versicolor
## 31      4.400000    3.200000        4.500         0.2     setosa
## 32      5.900000    3.200000        4.800         1.5 versicolor
## 33      5.700000    2.800000        4.500         1.3 versicolor
## 34      6.200000    2.900000        4.500         1.3 versicolor
## 35      6.600000    2.900000       23.000         1.3 versicolor
## 36      4.800000    3.000000        1.400         0.1     setosa
## 37      6.500000    3.000000        5.500         1.8  virginica
## 38      6.200000    2.200000        4.500         1.5 versicolor
## 39      6.700000    2.500000        5.800         1.8  virginica
## 40      5.000000    3.000000        1.600         0.2     setosa
## 41      5.000000    3.462121        1.200         0.2     setosa
## 42      5.800000    2.700000        3.900         1.2 versicolor
## 43      0.000000    3.462121        1.300         0.4     setosa
## 44      5.800000    2.700000        5.100         1.9  virginica
## 45      5.500000    4.200000        1.400         0.2     setosa
## 46      7.700000    2.800000        6.700         2.0  virginica
## 47      5.700000    3.462121        4.500         0.4     setosa
## 48      7.000000    3.200000        4.700         1.4 versicolor
## 49      6.500000    3.000000        5.800         2.2  virginica
## 50      6.000000    3.400000        4.500         1.6 versicolor
## 51      5.500000    2.600000        4.400         1.2 versicolor
## 52      4.900000    3.100000        4.500         0.2     setosa
## 53      5.200000    2.700000        3.900         1.4 versicolor
## 54      4.800000    3.400000        1.600         0.2     setosa
## 55      6.300000    3.300000        4.700         1.6 versicolor
## 56      7.700000    3.800000        6.700         2.2  virginica
## 57      5.100000    3.800000        1.500         0.3     setosa
## 58      5.849790    2.900000        4.500         1.5 versicolor
## 59      6.400000    2.800000        5.600         1.8  virginica
## 60      6.400000    2.800000        5.600         2.1  virginica
## 61      5.000000    2.300000        3.300         1.1 versicolor
## 62      7.400000    2.800000        6.100         1.9  virginica
## 63      4.300000    3.000000        1.100         0.1     setosa
## 64      5.000000    3.300000        1.400         0.2     setosa
## 65      7.200000    3.000000        5.800         1.6  virginica
## 66      6.300000    2.500000        4.900         1.5 versicolor
## 67      5.100000    2.500000        4.500         1.1 versicolor
## 68      7.234040    3.200000        5.700         2.3  virginica
## 69      5.100000    3.500000        4.500         0.2     setosa
## 70      5.000000    3.500000        1.300         0.3     setosa
## 71      6.100000    3.000000        4.600         1.4 versicolor
## 72      6.900000    3.100000        5.100         2.3  virginica
## 73      5.100000    3.500000        1.400         0.3     setosa
## 74      6.500000    3.462121        4.600         1.5 versicolor
## 75      5.600000    2.800000        4.900         2.0  virginica
## 76      4.900000    2.500000        4.500         1.9  virginica
## 77      5.500000    3.500000        1.300         0.2     setosa
## 78      7.600000    3.000000        6.600         2.1  virginica
## 79      5.100000    3.800000        0.000         0.2     setosa
## 80      7.900000    3.800000        6.400         2.0  virginica
## 81      6.100000    2.600000        5.600         1.4  virginica
## 82      5.400000    3.400000        1.700         0.2     setosa
## 83      6.100000    2.900000        4.700         1.4 versicolor
## 84      5.400000    3.700000        1.500         0.2     setosa
## 85      6.700000    3.000000        5.200         2.3  virginica
## 86      5.100000    3.800000        1.900         0.3     setosa
## 87      6.400000    2.900000        4.300         1.3 versicolor
## 88      5.700000    2.900000        4.200         1.3 versicolor
## 89      4.400000    2.900000        1.400         0.2     setosa
## 90      6.300000    2.500000        5.000         1.9  virginica
## 91      7.200000    3.200000        6.000         1.8  virginica
## 92      4.900000    3.462121        3.300         1.0 versicolor
## 93      5.200000    3.400000        1.400         0.2     setosa
## 94      5.800000    2.700000        5.100         1.9  virginica
## 95      6.000000    2.200000        5.000         1.5  virginica
## 96      6.900000    3.100000        4.500         1.5 versicolor
## 97      5.500000    2.300000        4.000         1.3 versicolor
## 98      6.700000    3.462121        5.000         1.7 versicolor
## 99      5.700000    3.000000        4.200         1.2 versicolor
## 100     6.300000    2.800000        5.100         1.5  virginica
## 101     5.400000    3.400000        1.500         0.4     setosa
## 102     7.200000    3.600000        4.500         2.5  virginica
## 103     6.300000    2.700000        4.900         1.9  virginica
## 104     5.600000    3.000000        4.100         1.3 versicolor
## 105     5.100000    3.700000        4.500         0.4     setosa
## 106     5.500000    3.462121        0.925         1.0 versicolor
## 107     6.500000    3.000000        5.200         2.0  virginica
## 108     4.800000    3.000000        1.400         0.2     setosa
## 109     6.100000    2.800000        4.500         1.3 versicolor
## 110     4.600000    3.400000        1.400         0.3     setosa
## 111     6.300000    3.400000        4.500         2.4  virginica
## 112     5.000000    3.400000        1.500         0.2     setosa
## 113     5.100000    3.400000        1.500         0.2     setosa
## 114     7.187596    3.300000        5.700         2.1  virginica
## 115     6.700000    3.100000        4.700         1.5 versicolor
## 116     7.700000    2.600000        6.900         2.3  virginica
## 117     6.300000    3.462121        4.400         1.3 versicolor
## 118     4.600000    3.100000        1.500         0.2     setosa
## 119     6.712582    3.000000        5.500         2.1  virginica
## 120     5.429333    2.800000        4.700         1.2 versicolor
## 121     5.900000    3.000000        4.500         1.5 versicolor
## 122     4.500000    2.300000        1.300         0.3     setosa
## 123     6.400000    3.200000        5.300         2.3  virginica
## 124     5.200000    4.100000        1.500         0.1     setosa
## 125    49.000000   30.000000       14.000         2.0     setosa
## 126     5.600000    2.900000        3.600         1.3 versicolor
## 127     6.800000    3.200000        5.900         2.3  virginica
## 128     5.800000    3.462121        5.100         2.4  virginica
## 129     4.600000    3.600000        4.500         0.2     setosa
## 130     5.700000    3.462121        1.700         0.3     setosa
## 131     5.600000    2.500000        3.900         1.1 versicolor
## 132     6.700000    3.100000        4.400         1.4 versicolor
## 133     4.800000    3.462121        1.900         0.2     setosa
## 134     5.100000    3.300000        1.700         0.5     setosa
## 135     4.400000    3.000000        1.300         0.2     setosa
## 136     7.700000    3.000000        4.500         2.3  virginica
## 137     4.700000    3.200000        1.600         0.2     setosa
## 138     6.343972    3.000000        4.900         1.8  virginica
## 139     6.900000    3.100000        5.400         2.1  virginica
## 140     6.000000    2.200000        4.000         1.0 versicolor
## 141     5.000000    3.462121        1.400         0.2     setosa
## 142     5.500000    3.462121        3.800         1.1 versicolor
## 143     6.600000    3.000000        4.400         1.4 versicolor
## 144     6.300000    2.900000        5.600         1.8  virginica
## 145     5.700000    2.500000        5.000         2.0  virginica
## 146     6.700000    3.100000        5.600         2.4  virginica
## 147     5.600000    3.000000        4.500         1.5 versicolor
## 148     5.200000    3.500000        1.500         0.2     setosa
## 149     6.400000    3.100000        4.500         1.8  virginica
## 150     5.800000    2.600000        4.000         1.1 versicolor
##     Petal.Width_imp
## 1             FALSE
## 2             FALSE
## 3             FALSE
## 4             FALSE
## 5             FALSE
## 6             FALSE
## 7              TRUE
## 8             FALSE
## 9             FALSE
## 10            FALSE
## 11            FALSE
## 12            FALSE
## 13             TRUE
## 14            FALSE
## 15            FALSE
## 16            FALSE
## 17            FALSE
## 18            FALSE
## 19            FALSE
## 20            FALSE
## 21            FALSE
## 22            FALSE
## 23            FALSE
## 24            FALSE
## 25            FALSE
## 26            FALSE
## 27            FALSE
## 28             TRUE
## 29            FALSE
## 30            FALSE
## 31            FALSE
## 32             TRUE
## 33            FALSE
## 34            FALSE
## 35            FALSE
## 36            FALSE
## 37            FALSE
## 38            FALSE
## 39            FALSE
## 40            FALSE
## 41            FALSE
## 42            FALSE
## 43            FALSE
## 44            FALSE
## 45            FALSE
## 46            FALSE
## 47            FALSE
## 48            FALSE
## 49            FALSE
## 50            FALSE
## 51            FALSE
## 52            FALSE
## 53            FALSE
## 54            FALSE
## 55            FALSE
## 56            FALSE
## 57            FALSE
## 58            FALSE
## 59             TRUE
## 60            FALSE
## 61             TRUE
## 62            FALSE
## 63            FALSE
## 64            FALSE
## 65            FALSE
## 66            FALSE
## 67            FALSE
## 68            FALSE
## 69             TRUE
## 70            FALSE
## 71            FALSE
## 72            FALSE
## 73            FALSE
## 74            FALSE
## 75            FALSE
## 76             TRUE
## 77            FALSE
## 78            FALSE
## 79            FALSE
## 80            FALSE
## 81            FALSE
## 82            FALSE
## 83            FALSE
## 84            FALSE
## 85            FALSE
## 86             TRUE
## 87            FALSE
## 88            FALSE
## 89            FALSE
## 90            FALSE
## 91            FALSE
## 92            FALSE
## 93            FALSE
## 94            FALSE
## 95            FALSE
## 96            FALSE
## 97            FALSE
## 98            FALSE
## 99            FALSE
## 100           FALSE
## 101           FALSE
## 102           FALSE
## 103            TRUE
## 104           FALSE
## 105           FALSE
## 106           FALSE
## 107           FALSE
## 108            TRUE
## 109           FALSE
## 110           FALSE
## 111           FALSE
## 112           FALSE
## 113           FALSE
## 114           FALSE
## 115           FALSE
## 116           FALSE
## 117           FALSE
## 118           FALSE
## 119           FALSE
## 120           FALSE
## 121           FALSE
## 122           FALSE
## 123           FALSE
## 124           FALSE
## 125           FALSE
## 126           FALSE
## 127           FALSE
## 128           FALSE
## 129           FALSE
## 130           FALSE
## 131           FALSE
## 132           FALSE
## 133           FALSE
## 134           FALSE
## 135            TRUE
## 136           FALSE
## 137           FALSE
## 138           FALSE
## 139           FALSE
## 140           FALSE
## 141           FALSE
## 142           FALSE
## 143           FALSE
## 144           FALSE
## 145           FALSE
## 146           FALSE
## 147           FALSE
## 148           FALSE
## 149           FALSE
## 150            TRUE
# Status message only: it is printed unconditionally and does not itself
# check whether any missing values remain in dirty_iris.
cat("Imputation completed successfully!\n")
## Imputation completed successfully!