# Marks vector with three missing entries (NA) at positions 4, 8 and 10.
marks <- c(89, 95, 85, NA, 76, 69, 78, NA, 73, NA, 90, 69)

# Mean and median imputation

# Mean of the observed marks; na.rm = TRUE drops the NA entries first.
mean(marks, na.rm=TRUE)
[1] 80.44444
# Median of the observed marks, again ignoring NA.
median(marks, na.rm = TRUE)
[1] 78
# Same mean as above, obtained by explicitly subsetting to the non-missing values.
mean(marks[!is.na(marks)])
[1] 80.44444
# Load the Hmisc package (a single call is sufficient).
pacman::p_load(Hmisc)

# Ratio imputation

# Auxiliary vector with 12 values and no NA, the same length as `marks`;
# presumably earlier marks for the same students — confirm.
old_marks <- c(90, 100, 75, 89, 80, 77, 88, 69, 75, 88, 95, 70)
# Logical mask: TRUE at the positions where `marks` is NA.
missing_location <- is.na(marks)
missing_location
 [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
# Total over all old marks, including positions where `marks` is missing.
sum(old_marks)
[1] 996
# Total over the observed entries of `marks` only.
sum(marks, na.rm = TRUE)
[1] 724
pacman::p_load(Hmisc,VIM)
# Ratio of the old-mark total to the current-mark total, both restricted to the
# positions where `marks` is observed.
# NOTE(review): R is old/current, so estimating a missing mark from old_marks
# would need old_marks / R (or the inverse ratio) — confirm the intended direction.
R <- sum(old_marks[!missing_location])/sum(marks, na.rm = TRUE)
R
[1] 1.035912
# Start from a fresh copy of the built-in iris data.
data("iris") # load the data
head(iris)

# Blank out the first 10 Sepal.Length values to simulate missing data.
iris$Sepal.Length[1:10] <- NA
head(iris)

# Regress Sepal.Length on two fully observed predictors. Under R's default
# na.action, lm() leaves out the rows whose response is missing.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
model

Call:
lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)

Coefficients:
 (Intercept)   Sepal.Width  Petal.Length  
      2.2716        0.5931        0.4689  
# Rows whose Sepal.Length is missing and needs a model-based value.
# (Named `missing_rows` rather than `I` to avoid shadowing base::I.)
missing_rows <- is.na(iris$Sepal.Length)

# Fill the gaps with the fitted regression's predictions for those rows.
iris$Sepal.Length[missing_rows] <- predict(model, newdata = iris[missing_rows, ])
head(iris, 10)
# Reload a clean copy of iris before injecting missing values.
data(iris)
# Set 10 randomly chosen rows to NA in every column. seq_along() is used
# instead of 1:ncol() so the loop is skipped cleanly for a zero-column frame.
# No seed is set, so the affected rows differ from run to run.
for (i in seq_along(iris)) {
  iris[sample(nrow(iris), 10, replace = FALSE), i] <- NA
}

summary(iris)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :43  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.500   1st Qu.:0.250   versicolor:42  
 Median :5.800   Median :3.000   Median :4.300   Median :1.300   virginica :45  
 Mean   :5.861   Mean   :3.061   Mean   :3.728   Mean   :1.181   NA's      :20  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
 Max.   :7.900   Max.   :4.200   Max.   :6.900   Max.   :2.500                  
 NA's   :18      NA's   :19      NA's   :19      NA's   :19                     
# Impute the missing values in iris with kNN() (presumably VIM::kNN, loaded by
# the earlier p_load call — confirm). As the summary output shows, the result
# holds the filled-in columns plus one logical `<column>_imp` flag per variable.
iris2 <- kNN(iris)
summary(iris2)
  Sepal.Length    Sepal.Width    Petal.Length    Petal.Width        Species  
 Min.   :4.300   Min.   :2.00   Min.   :1.000   Min.   :0.1   setosa    :50  
 1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.600   1st Qu.:0.3   versicolor:50  
 Median :5.800   Median :3.00   Median :4.250   Median :1.3   virginica :50  
 Mean   :5.841   Mean   :3.06   Mean   :3.775   Mean   :1.2                  
 3rd Qu.:6.475   3rd Qu.:3.30   3rd Qu.:5.100   3rd Qu.:1.8                  
 Max.   :7.900   Max.   :4.20   Max.   :6.900   Max.   :2.5                  
 Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp Species_imp    
 Mode :logical    Mode :logical   Mode :logical    Mode :logical   Mode :logical  
 FALSE:132        FALSE:131       FALSE:131        FALSE:131       FALSE:130      
 TRUE :18         TRUE :19        TRUE :19         TRUE :19        TRUE :20       
                                                                                  
                                                                                  
                                                                                  
# Keep only the first five columns, dropping the `_imp` indicator flags.
iris2 <- iris2[1:5]
# Narrow to Sepal.Length. `select` must name an existing column: the misspelt
# `Sepal.Leng` is not matched and makes subset() fail with "object not found".
iris2 <- subset(iris2, select = Sepal.Length)
# Read the dirty_iris CSV straight from GitHub (requires network access).
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
# Number of missing Petal.Length values.
sum(is.na(dirty_iris$Petal.Length))
[1] 19
# Rows with no NA in any column, as a count and as a percentage of all rows.
num_complete_cases <- sum(complete.cases(dirty_iris))
percent_complete_cases <- (num_complete_cases / nrow(dirty_iris)) * 100
cat("Number of complete observations:", num_complete_cases, "\n")
Number of complete observations: 96 
cat("Percentage of complete observations:", percent_complete_cases, "%\n")
Percentage of complete observations: 64 %
# Count Inf/-Inf and NaN occurrences in each numeric column; non-numeric
# columns yield NULL. lapply() is used rather than sapply() so the result is
# always a named list, even if every column happens to be numeric (sapply()
# would then collapse the per-column lists into a matrix).
special_values <- lapply(dirty_iris, function(x) {
  if (is.numeric(x)) {
    list(
      Inf_values = sum(is.infinite(x)),
      NaN_values = sum(is.nan(x))
    )
  } else {
    NULL
  }
})
print(special_values)
$Sepal.Length
$Sepal.Length$Inf_values
[1] 0

$Sepal.Length$NaN_values
[1] 0


$Sepal.Width
$Sepal.Width$Inf_values
[1] 0

$Sepal.Width$NaN_values
[1] 0


$Petal.Length
$Petal.Length$Inf_values
[1] 0

$Petal.Length$NaN_values
[1] 0


$Petal.Width
$Petal.Width$Inf_values
[1] 1

$Petal.Width$NaN_values
[1] 0


$Species
NULL
# Convert the special values Inf, -Inf and NaN to NA in every numeric column;
# assigning through `dirty_iris[]` keeps the object a data frame.
dirty_iris[] <- lapply(dirty_iris, function(x) {
  if (is.numeric(x)) {
    x[is.infinite(x) | is.nan(x)] <- NA
  }
  x
})
# NA count per column. vapply() pins the result to one integer per column,
# whereas sapply() can silently change its return type on unexpected input.
vapply(dirty_iris, function(x) sum(is.na(x)), integer(1))
Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
          10           17           19           13            0 
# Validity rules: sepal width must be strictly positive and sepal length may
# not exceed 30.
invalid_sepal_width <- dirty_iris[["Sepal.Width"]] <= 0
invalid_sepal_length <- dirty_iris[["Sepal.Length"]] > 30

# which() keeps only the rows where the combined test is TRUE; rows where it
# evaluates to NA are left out.
violating_rows <- which(invalid_sepal_length | invalid_sepal_width)
num_violations <- length(violating_rows)
cat("Number of observations violating the rules:", num_violations, "\n")
dirty_iris[violating_rows, ]
NA
# Sepal.Width: fill missing values with the mean of the observed values.
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)
)

# Petal.Length: fill missing values with the median of the observed values.
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  median(dirty_iris$Petal.Length, na.rm = TRUE)
)

# Sepal.Length: fit a linear model on the other three measurements and use its
# predictions for the missing rows.
# NOTE(review): Petal.Width is not imputed anywhere above, so a row missing both
# Sepal.Length and Petal.Width would get an NA prediction — confirm whether any exist.
model_sepal_length <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)

dirty_iris$Sepal.Length <- replace(
  dirty_iris$Sepal.Length,
  is.na(dirty_iris$Sepal.Length),
  predict(model_sepal_length, newdata = dirty_iris[is.na(dirty_iris$Sepal.Length), ])
)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9Cm1hcmtzIDwtIGMoODksIDk1LCA4NSwgTkEsIDc2LCA2OSwgNzgsIE5BLCA3MywgTkEsIDkwLCA2OSkKYGBgCiMgbWVhbiBhbmQgbWVkaWFuIGltcHV0YXRpb24KYGBge3J9Cm1hcmtzMVshaXMubmEobWFya3MxKV0gCgptZWFuKG1hcmtzLCBuYS5ybT1UUlVFKQptZWRpYW4obWFya3MsIG5hLnJtID0gVFJVRSkKCm1lYW4obWFya3NbIWlzLm5hKG1hcmtzKV0pCmBgYAoKYGBge3J9CnBhY21hbjo6cF9sb2FkKEhtaXNjKQpgYGAKCmBgYHtyfQpwYWNtYW46OnBfbG9hZChIbWlzYykKYGBgCiMgUmF0aW8KYGBge3J9Cm9sZF9tYXJrcyA8LSBjKDkwLCAxMDAsIDc1LCA4OSwgODAsIDc3LCA4OCwgNjksIDc1LCA4OCwgOTUsIDcwKQptYXJrcyA8LSBjKDg5LCA5NSwgODUsIE5BLCA3NiwgNjksIDc4LCBOQSwgNzMsIE5BLCA5MCwgNjkpCmBgYAoKYGBge3J9Cm1pc3NpbmdfbG9jYXRpb24gPC0gaXMubmEobWFya3MpCm1pc3NpbmdfbG9jYXRpb24KYGBgCmBgYHtyfQpzdW0ob2xkX21hcmtzKQpgYGAKYGBge3J9CnN1bShtYXJrcywgbmEucm0gPSBUUlVFKQpgYGAKYGBge3J9CnBhY21hbjo6cF9sb2FkKEhtaXNjLFZJTSkKYGBgCgpgYGB7cn0KUiA8LSBzdW0ob2xkX21hcmtzWyFtaXNzaW5nX2xvY2F0aW9uXSkvc3VtKG1hcmtzLCBuYS5ybSA9IFRSVUUpClIKYGBgCmBgYHtyfQpkYXRhKCJpcmlzIikgIyBsb2FkIHRoZSBkYXRhCmhlYWQoaXJpcykKYGBgCmBgYHtyfQppcmlzJFNlcGFsLkxlbmd0aFsxOjEwXSA8LSBOQQppcmlzJFNlcGFsLkxlbmd0aFsxOjEwXSA8LSBOQQpoZWFkKGlyaXMpCmBgYApgYGB7cn0KbW9kZWwgPC0gbG0oU2VwYWwuTGVuZ3RoIH4gU2VwYWwuV2lkdGggKyBQZXRhbC5MZW5ndGgsIGRhdGEgPSBpcmlzKQptb2RlbApgYGAKYGBge3J9CkkgPC0gaXMubmEoaXJpcyRTZXBhbC5MZW5ndGgpCgppcmlzJFNlcGFsLkxlbmd0aFtJXSA8LSBwcmVkaWN0KG1vZGVsLG5ld2RhdGEgPSBpcmlzW0ksXSkKaGVhZChpcmlzLDEwKQpgYGAKCmBgYHtyfQpkYXRhKGlyaXMpCmBgYAoKYGBge3J9CmZvciAoaSBpbiAxOm5jb2woaXJpcykpIHsKIGlyaXNbc2FtcGxlKG5yb3coaXJpcyksIDEwLCByZXBsYWNlID0gRkFMU0UpLGldIDwtIE5BCn0KCnN1bW1hcnkoaXJpcykKYGBgCgpgYGB7cn0KaXJpczIgPC0ga05OKGlyaXMpCnN1bW1hcnkoaXJpczIpCmBgYApgYGB7cn0KaXJpczIgPC0gaXJpczJbMTo1XQppcmlzMiA8LSBzdWJzZXQoaXJpczIsIHNlbGVjdD1TZXBhbC5MZW5nKQpgYGAKCmBgYHtyfQpkaXJ0eV9pcmlzIDwtIHJlYWQuY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vZWR3aW5kai9kYXRhY2xlYW5pbmcvbWFzdGVyL2RhdGEvZGlydHlfaXJpcy5jc3YiKQpgYGAKCmBgYHtyfQpzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQpgYGAKCmBgYHtyfQpudW1fY29tcGxldGVfY2FzZXMgPC0gc3Vt
KGNvbXBsZXRlLmNhc2VzKGRpcnR5X2lyaXMpKQpwZXJjZW50X2NvbXBsZXRlX2Nhc2VzIDwtIChudW1fY29tcGxldGVfY2FzZXMgLyBucm93KGRpcnR5X2lyaXMpKSAqIDEwMApjYXQoIk51bWJlciBvZiBjb21wbGV0ZSBvYnNlcnZhdGlvbnM6IiwgbnVtX2NvbXBsZXRlX2Nhc2VzLCAiXG4iKQpjYXQoIlBlcmNlbnRhZ2Ugb2YgY29tcGxldGUgb2JzZXJ2YXRpb25zOiIsIHBlcmNlbnRfY29tcGxldGVfY2FzZXMsICIlXG4iKQpgYGAKCmBgYHtyfQpzcGVjaWFsX3ZhbHVlcyA8LSBzYXBwbHkoZGlydHlfaXJpcywgZnVuY3Rpb24oeCkgewogIGlmIChpcy5udW1lcmljKHgpKSB7CiAgICBsaXN0KAogICAgICBJbmZfdmFsdWVzID0gc3VtKGlzLmluZmluaXRlKHgpKSwKICAgICAgTmFOX3ZhbHVlcyA9IHN1bShpcy5uYW4oeCkpCiAgICApCiAgfSBlbHNlIHsKICAgIE5VTEwKICB9Cn0pCnByaW50KHNwZWNpYWxfdmFsdWVzKQpgYGAKCmBgYHtyfQpkaXJ0eV9pcmlzW10gPC0gbGFwcGx5KGRpcnR5X2lyaXMsIGZ1bmN0aW9uKHgpIHsKICBpZiAoaXMubnVtZXJpYyh4KSkgewogICAgeFtpcy5pbmZpbml0ZSh4KV0gPC0gTkEgICMgUmVwbGFjZSBJbmYgYW5kIC1JbmYgd2l0aCBOQQogICAgeFtpcy5uYW4oeCldIDwtIE5BICAgICAgICAjIFJlcGxhY2UgTmFOIHdpdGggTkEKICB9CiAgcmV0dXJuKHgpCn0pCnNhcHBseShkaXJ0eV9pcmlzLCBmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQoKCmBgYApgYGB7cn0KaW52YWxpZF9zZXBhbF93aWR0aCA8LSBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDAKaW52YWxpZF9zZXBhbF9sZW5ndGggPC0gZGlydHlfaXJpcyRTZXBhbC5MZW5ndGggPiAzMAp2aW9sYXRpbmdfcm93cyA8LSB3aGljaChpbnZhbGlkX3NlcGFsX3dpZHRoIHwgaW52YWxpZF9zZXBhbF9sZW5ndGgpCm51bV92aW9sYXRpb25zIDwtIGxlbmd0aCh2aW9sYXRpbmdfcm93cykKY2F0KCJOdW1iZXIgb2Ygb2JzZXJ2YXRpb25zIHZpb2xhdGluZyB0aGUgcnVsZXM6IiwgbnVtX3Zpb2xhdGlvbnMsICJcbiIpCmRpcnR5X2lyaXNbdmlvbGF0aW5nX3Jvd3MsIF0KCmBgYAoKYGBge3J9CmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpcyRTZXBhbC5XaWR0aCwgbmEucm0gPSBUUlVFKQoKZGlydHlfaXJpcyRQZXRhbC5MZW5ndGhbaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpXSA8LSBtZWRpYW4oZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgsIG5hLnJtID0gVFJVRSkKCm1vZGVsX3NlcGFsX2xlbmd0aCA8LSBsbShTZXBhbC5MZW5ndGggfiBTZXBhbC5XaWR0aCArIFBldGFsLkxlbmd0aCArIFBldGFsLldpZHRoLCBkYXRhID0gZGlydHlfaXJpcykKCmRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoW2lzLm5hKGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoKV0gPC0gcHJlZGljdChtb2RlbF9zZXBhbF9sZW5ndGgsIG5ld2RhdGEgPSBkaXJ0eV9pcmlzW2lzLm5hKGRpcnR5X2ly
aXMkU2VwYWwuTGVuZ3RoKSwgXSkKCgpgYGAKCg==