Ratio
# Twelve fully observed marks (presumably an earlier assessment of the same
# students as `marks` -- confirm).
old_marks <- c(90, 100, 75, 89, 80, 77, 88, 69, 75, 88, 95, 70)
# Logical mask: TRUE at every position where `marks` is NA. `marks` is not
# defined in this part of the file and must already exist in the workspace.
missing_location <- is.na(marks)
# Auto-print the mask.
missing_location
[1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
# Total of all old marks, including positions where `marks` is missing.
sum(old_marks)
[1] 996
# Total of the observed entries of `marks`; na.rm = TRUE drops the NAs.
sum(marks, na.rm = TRUE)
[1] 724
pacman::p_load(Hmisc,VIM)
# Ratio of the old-mark total to the current-mark total, both restricted to
# the positions where `marks` is observed.
# NOTE(review): R is old/new, so estimating a missing mark from its old mark
# would mean dividing by R rather than multiplying -- confirm the intended
# direction before using R for imputation.
R <- sum(old_marks[!missing_location])/sum(marks, na.rm = TRUE)
R
[1] 1.035912
# Bring the iris data set into the workspace and preview its first rows.
data("iris") # load the data
head(iris)
# Simulate missing data: blank out Sepal.Length in the first 10 rows, then
# preview the result.
iris$Sepal.Length[1:10] <- NA
head(iris)
# Fit a linear regression of Sepal.Length on Sepal.Width and Petal.Length.
# With R's default na.action, rows containing NA are left out of the fit.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
# Auto-print the call and the fitted coefficients.
model
Call:
lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
Coefficients:
(Intercept) Sepal.Width Petal.Length
2.2716 0.5931 0.4689
# Regression imputation: find the rows where Sepal.Length is missing and
# replace those entries with the fitted model's predictions for the same rows.
sepal_length_missing <- is.na(iris$Sepal.Length)
rows_to_fill <- iris[sepal_length_missing, ]
iris$Sepal.Length[sepal_length_missing] <- predict(model, newdata = rows_to_fill)
# Show the first 10 rows, which now hold the imputed values.
head(iris, n = 10)
# Restore a clean copy of iris, then blank out 10 randomly chosen rows in
# every column. Rows are drawn independently for each column, and no seed is
# set here, so the affected cells differ from run to run.
data(iris)
for (i in seq_len(ncol(iris))) {
  iris[sample(nrow(iris), 10, replace = FALSE), i] <- NA
}
# Per-column summaries, including the NA counts.
summary(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :43
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.500 1st Qu.:0.250 versicolor:42
Median :5.800 Median :3.000 Median :4.300 Median :1.300 virginica :45
Mean :5.861 Mean :3.061 Mean :3.728 Mean :1.181 NA's :20
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.200 Max. :6.900 Max. :2.500
NA's :18 NA's :19 NA's :19 NA's :19
# Impute the missing values in iris with kNN() (presumably VIM's
# k-nearest-neighbour imputation, loaded via pacman::p_load -- confirm).
# As the summary shows, the result has no NAs left and gains one logical
# <variable>_imp column per variable.
iris2 <- kNN(iris)
summary(iris2)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.00 Min. :1.000 Min. :0.1 setosa :50
1st Qu.:5.100 1st Qu.:2.80 1st Qu.:1.600 1st Qu.:0.3 versicolor:50
Median :5.800 Median :3.00 Median :4.250 Median :1.3 virginica :50
Mean :5.841 Mean :3.06 Mean :3.775 Mean :1.2
3rd Qu.:6.475 3rd Qu.:3.30 3rd Qu.:5.100 3rd Qu.:1.8
Max. :7.900 Max. :4.20 Max. :6.900 Max. :2.5
Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp Species_imp
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:132 FALSE:131 FALSE:131 FALSE:131 FALSE:130
TRUE :18 TRUE :19 TRUE :19 TRUE :19 TRUE :20
# Keep only the five iris variables and drop the *_imp indicator columns
# appended by kNN(). Columns are selected by name range, so the result does
# not depend on how many indicator columns follow them.
iris2 <- subset(iris2, select = Sepal.Length:Species)
# Read the dirty_iris CSV straight from GitHub into a data frame.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)
# Number of missing Petal.Length values.
sum(is.na(dirty_iris[["Petal.Length"]]))
[1] 19
# A complete case is a row with no NA in any column.
num_complete_cases <- sum(complete.cases(dirty_iris))
# Share of complete rows, expressed as a percentage of all rows.
percent_complete_cases <- (num_complete_cases / nrow(dirty_iris)) * 100
cat("Number of complete observations:", num_complete_cases, "\n")
Number of complete observations: 96
# Report the percentage of complete rows computed above.
cat("Percentage of complete observations:", percent_complete_cases, "%\n")
Percentage of complete observations: 64 %
# For every numeric column, count the infinite (Inf / -Inf) and NaN entries;
# non-numeric columns yield NULL. lapply() is used so the result is always a
# named list with one element per column, whatever the column types are.
special_values <- lapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(NULL)
  }
  list(
    Inf_values = sum(is.infinite(column)),
    NaN_values = sum(is.nan(column))
  )
})
print(special_values)
$Sepal.Length
$Sepal.Length$Inf_values
[1] 0
$Sepal.Length$NaN_values
[1] 0
$Sepal.Width
$Sepal.Width$Inf_values
[1] 0
$Sepal.Width$NaN_values
[1] 0
$Petal.Length
$Petal.Length$Inf_values
[1] 0
$Petal.Length$NaN_values
[1] 0
$Petal.Width
$Petal.Width$Inf_values
[1] 1
$Petal.Width$NaN_values
[1] 0
$Species
NULL
# Treat special numeric values as missing: every Inf, -Inf and NaN in a
# numeric column becomes NA. Non-numeric columns pass through unchanged, and
# assigning into dirty_iris[] keeps the data frame structure.
dirty_iris[] <- lapply(dirty_iris, function(column) {
  if (!is.numeric(column)) {
    return(column)
  }
  column[is.infinite(column) | is.nan(column)] <- NA
  column
})
# NA count per column; vapply() guarantees an integer vector named by column.
vapply(dirty_iris, function(column) sum(is.na(column)), integer(1))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
10 17 19 13 0
# Plausibility rules: a sepal width must be positive and a sepal length must
# not exceed 30. Each mask is TRUE where its rule is broken and NA where the
# value itself is missing.
invalid_sepal_width <- with(dirty_iris, Sepal.Width <= 0)
invalid_sepal_length <- with(dirty_iris, Sepal.Length > 30)
# which() keeps only the TRUE positions, so rows whose check evaluates to NA
# are not counted as violations.
violating_rows <- which(invalid_sepal_width | invalid_sepal_length)
num_violations <- length(violating_rows)
cat("Number of observations violating the rules:", num_violations, "\n")
Number of observations violating the rules: 4
# Show the full records that break at least one of the rules.
dirty_iris[violating_rows, ]
NA
# Mean imputation: fill missing Sepal.Width with the mean of the observed
# values.
# NOTE(review): nothing in this block turns the rule-violating values into NA
# first, so they may still feed into these statistics -- confirm.
sepal_width_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <- sepal_width_mean

# Median imputation: fill missing Petal.Length with the observed median.
petal_length_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <- petal_length_median

# Regression imputation: model Sepal.Length on the other three measurements
# and predict it for the rows where it is missing.
# NOTE(review): Petal.Width is not imputed in this block; a row that lacks
# both Sepal.Length and Petal.Width would get an NA prediction -- confirm.
model_sepal_length <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = dirty_iris
)
sepal_length_missing <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[sepal_length_missing] <- predict(
  model_sepal_length,
  newdata = dirty_iris[sepal_length_missing, ]
)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9Cm1hcmtzIDwtIGMoODksIDk1LCA4NSwgTkEsIDc2LCA2OSwgNzgsIE5BLCA3MywgTkEsIDkwLCA2OSkKYGBgCiMgbWVhbiBhbmQgbWVkaWFuIGltcHV0YXRpb24KYGBge3J9Cm1hcmtzMVshaXMubmEobWFya3MxKV0gCgptZWFuKG1hcmtzLCBuYS5ybT1UUlVFKQptZWRpYW4obWFya3MsIG5hLnJtID0gVFJVRSkKCm1lYW4obWFya3NbIWlzLm5hKG1hcmtzKV0pCmBgYAoKYGBge3J9CnBhY21hbjo6cF9sb2FkKEhtaXNjKQpgYGAKCmBgYHtyfQpwYWNtYW46OnBfbG9hZChIbWlzYykKYGBgCiMgUmF0aW8KYGBge3J9Cm9sZF9tYXJrcyA8LSBjKDkwLCAxMDAsIDc1LCA4OSwgODAsIDc3LCA4OCwgNjksIDc1LCA4OCwgOTUsIDcwKQptYXJrcyA8LSBjKDg5LCA5NSwgODUsIE5BLCA3NiwgNjksIDc4LCBOQSwgNzMsIE5BLCA5MCwgNjkpCmBgYAoKYGBge3J9Cm1pc3NpbmdfbG9jYXRpb24gPC0gaXMubmEobWFya3MpCm1pc3NpbmdfbG9jYXRpb24KYGBgCmBgYHtyfQpzdW0ob2xkX21hcmtzKQpgYGAKYGBge3J9CnN1bShtYXJrcywgbmEucm0gPSBUUlVFKQpgYGAKYGBge3J9CnBhY21hbjo6cF9sb2FkKEhtaXNjLFZJTSkKYGBgCgpgYGB7cn0KUiA8LSBzdW0ob2xkX21hcmtzWyFtaXNzaW5nX2xvY2F0aW9uXSkvc3VtKG1hcmtzLCBuYS5ybSA9IFRSVUUpClIKYGBgCmBgYHtyfQpkYXRhKCJpcmlzIikgIyBsb2FkIHRoZSBkYXRhCmhlYWQoaXJpcykKYGBgCmBgYHtyfQppcmlzJFNlcGFsLkxlbmd0aFsxOjEwXSA8LSBOQQppcmlzJFNlcGFsLkxlbmd0aFsxOjEwXSA8LSBOQQpoZWFkKGlyaXMpCmBgYApgYGB7cn0KbW9kZWwgPC0gbG0oU2VwYWwuTGVuZ3RoIH4gU2VwYWwuV2lkdGggKyBQZXRhbC5MZW5ndGgsIGRhdGEgPSBpcmlzKQptb2RlbApgYGAKYGBge3J9CkkgPC0gaXMubmEoaXJpcyRTZXBhbC5MZW5ndGgpCgppcmlzJFNlcGFsLkxlbmd0aFtJXSA8LSBwcmVkaWN0KG1vZGVsLG5ld2RhdGEgPSBpcmlzW0ksXSkKaGVhZChpcmlzLDEwKQpgYGAKCmBgYHtyfQpkYXRhKGlyaXMpCmBgYAoKYGBge3J9CmZvciAoaSBpbiAxOm5jb2woaXJpcykpIHsKIGlyaXNbc2FtcGxlKG5yb3coaXJpcyksIDEwLCByZXBsYWNlID0gRkFMU0UpLGldIDwtIE5BCn0KCnN1bW1hcnkoaXJpcykKYGBgCgpgYGB7cn0KaXJpczIgPC0ga05OKGlyaXMpCnN1bW1hcnkoaXJpczIpCmBgYApgYGB7cn0KaXJpczIgPC0gaXJpczJbMTo1XQppcmlzMiA8LSBzdWJzZXQoaXJpczIsIHNlbGVjdD1TZXBhbC5MZW5nKQpgYGAKCmBgYHtyfQpkaXJ0eV9pcmlzIDwtIHJlYWQuY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vZWR3aW5kai9kYXRhY2xlYW5pbmcvbWFzdGVyL2RhdGEvZGlydHlfaXJpcy5jc3YiKQpgYGAKCmBgYHtyfQpzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQpgYGAKCmBgYHtyfQpudW1fY29tcGxldGVfY2FzZXMgPC0gc3Vt
KGNvbXBsZXRlLmNhc2VzKGRpcnR5X2lyaXMpKQpwZXJjZW50X2NvbXBsZXRlX2Nhc2VzIDwtIChudW1fY29tcGxldGVfY2FzZXMgLyBucm93KGRpcnR5X2lyaXMpKSAqIDEwMApjYXQoIk51bWJlciBvZiBjb21wbGV0ZSBvYnNlcnZhdGlvbnM6IiwgbnVtX2NvbXBsZXRlX2Nhc2VzLCAiXG4iKQpjYXQoIlBlcmNlbnRhZ2Ugb2YgY29tcGxldGUgb2JzZXJ2YXRpb25zOiIsIHBlcmNlbnRfY29tcGxldGVfY2FzZXMsICIlXG4iKQpgYGAKCmBgYHtyfQpzcGVjaWFsX3ZhbHVlcyA8LSBzYXBwbHkoZGlydHlfaXJpcywgZnVuY3Rpb24oeCkgewogIGlmIChpcy5udW1lcmljKHgpKSB7CiAgICBsaXN0KAogICAgICBJbmZfdmFsdWVzID0gc3VtKGlzLmluZmluaXRlKHgpKSwKICAgICAgTmFOX3ZhbHVlcyA9IHN1bShpcy5uYW4oeCkpCiAgICApCiAgfSBlbHNlIHsKICAgIE5VTEwKICB9Cn0pCnByaW50KHNwZWNpYWxfdmFsdWVzKQpgYGAKCmBgYHtyfQpkaXJ0eV9pcmlzW10gPC0gbGFwcGx5KGRpcnR5X2lyaXMsIGZ1bmN0aW9uKHgpIHsKICBpZiAoaXMubnVtZXJpYyh4KSkgewogICAgeFtpcy5pbmZpbml0ZSh4KV0gPC0gTkEgICMgUmVwbGFjZSBJbmYgYW5kIC1JbmYgd2l0aCBOQQogICAgeFtpcy5uYW4oeCldIDwtIE5BICAgICAgICAjIFJlcGxhY2UgTmFOIHdpdGggTkEKICB9CiAgcmV0dXJuKHgpCn0pCnNhcHBseShkaXJ0eV9pcmlzLCBmdW5jdGlvbih4KSBzdW0oaXMubmEoeCkpKQoKCmBgYApgYGB7cn0KaW52YWxpZF9zZXBhbF93aWR0aCA8LSBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDAKaW52YWxpZF9zZXBhbF9sZW5ndGggPC0gZGlydHlfaXJpcyRTZXBhbC5MZW5ndGggPiAzMAp2aW9sYXRpbmdfcm93cyA8LSB3aGljaChpbnZhbGlkX3NlcGFsX3dpZHRoIHwgaW52YWxpZF9zZXBhbF9sZW5ndGgpCm51bV92aW9sYXRpb25zIDwtIGxlbmd0aCh2aW9sYXRpbmdfcm93cykKY2F0KCJOdW1iZXIgb2Ygb2JzZXJ2YXRpb25zIHZpb2xhdGluZyB0aGUgcnVsZXM6IiwgbnVtX3Zpb2xhdGlvbnMsICJcbiIpCmRpcnR5X2lyaXNbdmlvbGF0aW5nX3Jvd3MsIF0KCmBgYAoKYGBge3J9CmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCldIDwtIG1lYW4oZGlydHlfaXJpcyRTZXBhbC5XaWR0aCwgbmEucm0gPSBUUlVFKQoKZGlydHlfaXJpcyRQZXRhbC5MZW5ndGhbaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpXSA8LSBtZWRpYW4oZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgsIG5hLnJtID0gVFJVRSkKCm1vZGVsX3NlcGFsX2xlbmd0aCA8LSBsbShTZXBhbC5MZW5ndGggfiBTZXBhbC5XaWR0aCArIFBldGFsLkxlbmd0aCArIFBldGFsLldpZHRoLCBkYXRhID0gZGlydHlfaXJpcykKCmRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoW2lzLm5hKGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoKV0gPC0gcHJlZGljdChtb2RlbF9zZXBhbF9sZW5ndGgsIG5ld2RhdGEgPSBkaXJ0eV9pcmlzW2lzLm5hKGRpcnR5X2ly
aXMkU2VwYWwuTGVuZ3RoKSwgXSkKCgpgYGAKCg==