# Read the dirty_iris CSV from the datacleaning repository on GitHub
# (needs network access) into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many Petal.Length entries are missing?
sum(is.na(dirty_iris[["Petal.Length"]]))
[1] 19
# A row is complete when none of its columns is NA. na.omit() keeps exactly
# those rows, so its row count is the number of complete observations.
num_complete_cases <- nrow(na.omit(dirty_iris))

# Share of complete rows relative to all rows, expressed as a percentage.
percent_complete_cases <- 100 * (num_complete_cases / nrow(dirty_iris))

cat(
  "Number of complete observations:",
  num_complete_cases,
  "\n",
  sep = " "
)
Number of complete observations: 96 
# Print the percentage of complete rows computed earlier in the script.
cat(
  "Percentage of complete observations:",
  percent_complete_cases,
  "%\n",
  sep = " "
)
Percentage of complete observations: 64 %
# Count special floating-point values in every column of dirty_iris.
#
# For each numeric column the result holds a two-element list:
#   Inf_values - number of Inf / -Inf entries
#   NaN_values - number of NaN entries
# Non-numeric columns yield NULL.
#
# lapply() is used rather than sapply() so the result is always a named list.
# sapply() only returned a list here because the NULL entry prevented
# simplification; with all-numeric columns it would silently switch to a
# matrix and change how the result prints and is indexed.
special_values <- lapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  list(
    Inf_values = sum(is.infinite(column)),
    NaN_values = sum(is.nan(column))
  )
})
print(special_values)
$Sepal.Length
$Sepal.Length$Inf_values
[1] 0

$Sepal.Length$NaN_values
[1] 0


$Sepal.Width
$Sepal.Width$Inf_values
[1] 0

$Sepal.Width$NaN_values
[1] 0


$Petal.Length
$Petal.Length$Inf_values
[1] 0

$Petal.Length$NaN_values
[1] 0


$Petal.Width
$Petal.Width$Inf_values
[1] 1

$Petal.Width$NaN_values
[1] 0


$Species
NULL
# Turn special values into ordinary missing values: in every numeric column,
# Inf, -Inf and NaN are replaced with NA. Non-numeric columns pass through
# unchanged. Assigning into dirty_iris[] keeps the data frame structure
# (column names, row names, class) while swapping in the cleaned columns.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (is.numeric(column)) {
    column[is.infinite(column) | is.nan(column)] <- NA
  }
  column
})

# Missing-value count per column. vapply() guarantees a named integer vector,
# whereas sapply()'s return type depends on the shape of its input.
vapply(dirty_iris, function(column) sum(is.na(column)), integer(1))
Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
          10           17           19           13            0 
# Plausibility rules checked here:
#   * Sepal.Width must be strictly positive
#   * Sepal.Length must not exceed 30
# Each logical vector is TRUE where its rule is broken and NA where the
# measurement itself is missing.
invalid_sepal_width <- with(dirty_iris, Sepal.Width <= 0)
invalid_sepal_length <- with(dirty_iris, Sepal.Length > 30)

# which() keeps only positions that are definitely TRUE, so comparisons that
# evaluate to NA are not counted as violations.
violating_rows <- which(invalid_sepal_length | invalid_sepal_width)
num_violations <- length(violating_rows)

cat(
  "Number of observations violating the rules:",
  num_violations,
  "\n",
  sep = " "
)
Number of observations violating the rules: 4 
# Display the complete records that break at least one of the rules above.
dirty_iris[violating_rows, , drop = FALSE]
# NOTE(review): the bare NA below only evaluates and prints `[1] NA`; it
# looks like a placeholder chunk - confirm whether a correction step for the
# violating values was meant to go here.
NA
[1] NA
# Sepal.Width: fill missing entries with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill missing entries with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: regress on the other three measurements and use the fitted
# model to predict the missing entries. This runs after the two imputations
# above, so the model sees the already-filled Sepal.Width and Petal.Length.
model_sepal_length <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)

# NOTE(review): Petal.Width is not imputed anywhere in this script, so
# predict() presumably returns NA for rows where it is missing and those
# Sepal.Length entries would stay NA - confirm whether that is acceptable.
dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(
    model_sepal_length,
    newdata = dirty_iris[is.na(dirty_iris$Sepal.Length), ]
  )
)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQpgYGB7cn0KZGlydHlfaXJpcyA8LSByZWFkLmNzdigiaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL2Vkd2luZGovZGF0YWNsZWFuaW5nL21hc3Rlci9kYXRhL2RpcnR5X2lyaXMuY3N2IikKYGBgCgpgYGB7cn0Kc3VtKGlzLm5hKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoKSkKYGBgCgpgYGB7cn0KbnVtX2NvbXBsZXRlX2Nhc2VzIDwtIHN1bShjb21wbGV0ZS5jYXNlcyhkaXJ0eV9pcmlzKSkKcGVyY2VudF9jb21wbGV0ZV9jYXNlcyA8LSAobnVtX2NvbXBsZXRlX2Nhc2VzIC8gbnJvdyhkaXJ0eV9pcmlzKSkgKiAxMDAKY2F0KCJOdW1iZXIgb2YgY29tcGxldGUgb2JzZXJ2YXRpb25zOiIsIG51bV9jb21wbGV0ZV9jYXNlcywgIlxuIikKYGBgCgpgYGB7cn0KY2F0KCJQZXJjZW50YWdlIG9mIGNvbXBsZXRlIG9ic2VydmF0aW9uczoiLCBwZXJjZW50X2NvbXBsZXRlX2Nhc2VzLCAiJVxuIikKYGBgCgpgYGB7cn0Kc3BlY2lhbF92YWx1ZXMgPC0gc2FwcGx5KGRpcnR5X2lyaXMsIGZ1bmN0aW9uKHgpIHsKICBpZiAoaXMubnVtZXJpYyh4KSkgewogICAgbGlzdCgKICAgICAgSW5mX3ZhbHVlcyA9IHN1bShpcy5pbmZpbml0ZSh4KSksCiAgICAgIE5hTl92YWx1ZXMgPSBzdW0oaXMubmFuKHgpKQogICAgKQogIH0gZWxzZSB7CiAgICBOVUxMCiAgfQp9KQpwcmludChzcGVjaWFsX3ZhbHVlcykKYGBgCgpgYGB7cn0KZGlydHlfaXJpc1tdIDwtIGxhcHBseShkaXJ0eV9pcmlzLCBmdW5jdGlvbih4KSB7CiAgaWYgKGlzLm51bWVyaWMoeCkpIHsKICAgIHhbaXMuaW5maW5pdGUoeCldIDwtIE5BICAjIFJlcGxhY2UgSW5mIGFuZCAtSW5mIHdpdGggTkEKICAgIHhbaXMubmFuKHgpXSA8LSBOQSAgICAgICAgIyBSZXBsYWNlIE5hTiB3aXRoIE5BCiAgfQogIHJldHVybih4KQp9KQpzYXBwbHkoZGlydHlfaXJpcywgZnVuY3Rpb24oeCkgc3VtKGlzLm5hKHgpKSkKYGBgCgpgYGB7cn0KaW52YWxpZF9zZXBhbF93aWR0aCA8LSBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDAKaW52YWxpZF9zZXBhbF9sZW5ndGggPC0gZGlydHlfaXJpcyRTZXBhbC5MZW5ndGggPiAzMAp2aW9sYXRpbmdfcm93cyA8LSB3aGljaChpbnZhbGlkX3NlcGFsX3dpZHRoIHwgaW52YWxpZF9zZXBhbF9sZW5ndGgpCm51bV92aW9sYXRpb25zIDwtIGxlbmd0aCh2aW9sYXRpbmdfcm93cykKY2F0KCJOdW1iZXIgb2Ygb2JzZXJ2YXRpb25zIHZpb2xhdGluZyB0aGUgcnVsZXM6IiwgbnVtX3Zpb2xhdGlvbnMsICJcbiIpCmBgYAoKYGBge3J9CmRpcnR5X2lyaXNbdmlvbGF0aW5nX3Jvd3MsIF0KYGBgCgpgYGB7cn0KTkEKYGBgCgpgYGB7cn0KZGlydHlfaXJpcyRTZXBhbC5XaWR0aFtpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKV0gPC0gbWVhbihkaXJ0eV9pcmlzJFNlcGFsLldpZHRoLCBuYS5ybSA9IFRSVUUpCgpkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aFtpcy5uYShkaXJ0eV9pcmlzJFBl
dGFsLkxlbmd0aCldIDwtIG1lZGlhbihkaXJ0eV9pcmlzJFBldGFsLkxlbmd0aCwgbmEucm0gPSBUUlVFKQoKbW9kZWxfc2VwYWxfbGVuZ3RoIDwtIGxtKFNlcGFsLkxlbmd0aCB+IFNlcGFsLldpZHRoICsgUGV0YWwuTGVuZ3RoICsgUGV0YWwuV2lkdGgsIGRhdGEgPSBkaXJ0eV9pcmlzKQoKZGlydHlfaXJpcyRTZXBhbC5MZW5ndGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5MZW5ndGgpXSA8LSBwcmVkaWN0KG1vZGVsX3NlcGFsX2xlbmd0aCwgbmV3ZGF0YSA9IGRpcnR5X2lyaXNbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5MZW5ndGgpLCBdKQpgYGAKCg==