Olivia Thompson

2/28/25

Question One

# Load the raw data set straight from the course repository.
dirty_iris_url <- "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
dirty_iris <- read.csv(dirty_iris_url)

# Number of NA entries in the Petal.Length column.
petal_length_is_missing <- is.na(dirty_iris[["Petal.Length"]])
missing_values <- sum(petal_length_is_missing)
missing_values

Question Two

# One logical per row: TRUE when the row contains no NA in any column.
complete_cases <- complete.cases(dirty_iris)
complete_cases

# Count of complete rows and their share of all rows, in percent.
total_rows <- nrow(dirty_iris)
num_complete_cases <- sum(complete_cases)
percentage_complete_cases <- (num_complete_cases / total_rows) * 100
num_complete_cases
percentage_complete_cases

Question Three


# Count infinite values per numeric column, split by sign.
# The numeric columns are selected once and reused for both counts.
# vapply() guarantees one integer per column, whereas sapply() changes its
# return type when the input is empty.
numeric_columns <- dirty_iris[vapply(dirty_iris, is.numeric, logical(1))]

# Entries equal to -Inf in each numeric column.
negative_inf_values <- vapply(
  numeric_columns,
  function(x) sum(is.infinite(x) & x < 0),
  integer(1)
)
negative_inf_values

# Entries equal to +Inf in each numeric column.
positive_inf_values <- vapply(
  numeric_columns,
  function(x) sum(is.infinite(x) & x > 0),
  integer(1)
)
positive_inf_values

Question Four

# Locate infinite values in the numeric columns, replace them with NA, and
# show the affected rows afterwards.
is_numeric_col <- vapply(dirty_iris, is.numeric, logical(1))
numeric_columns <- dirty_iris[is_numeric_col]

# Row positions of infinite values, one element per numeric column.
# lapply() always returns a list; sapply() would return a vector, matrix or
# list depending on how many hits each column has.
inf_rows <- lapply(numeric_columns, function(x) which(is.infinite(x)))
inf_rows

# Replace Inf/-Inf with NA column by column. lapply() works on each column
# directly; apply() would first coerce the data frame to a matrix.
dirty_iris[is_numeric_col] <- lapply(dirty_iris[is_numeric_col], function(x) {
  x[is.infinite(x)] <- NA
  x
})

# Display every row that contained an infinite value, instead of a
# hard-coded row number, to confirm the replacement.
rows_with_inf <- unique(unlist(inf_rows, use.names = FALSE))
dirty_iris[rows_with_inf, ]

Question Five

# Find observations that break either rule checked here:
# Sepal.Width <= 0 or Sepal.Length > 30.
#
# The comparison yields NA wherever the measurement is missing. Indexing a
# data frame with NA in a logical mask returns extra all-NA rows, which
# inflated the violation count. which() keeps only positions that are TRUE,
# so rows with missing values are not counted as violations.
rule_broken <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30
violating_rows <- dirty_iris[which(rule_broken), ]
num_violations <- nrow(violating_rows)
print(paste("Number of observations violating the rules:", num_violations))
print("Violating observations:")
print(violating_rows)

# Same check, one rule at a time, keeping the row indices for each rule.
violating_sepal_width <- which(dirty_iris$Sepal.Width <= 0)
violating_sepal_length <- which(dirty_iris$Sepal.Length > 30)
# unique() ensures a row that breaks both rules is counted once.
violating_indices <- unique(c(violating_sepal_width, violating_sepal_length))
violating_rows <- dirty_iris[violating_indices, ]
n_violations <- length(violating_indices)
n_violations

Question Six

# Rows whose Sepal.Width is zero or negative. which() skips NA comparisons,
# so missing widths are never selected.
violating_indices <- which(dirty_iris$Sepal.Width <= 0)

print("Violating Observations Before Correction:")
print(dirty_iris[violating_indices, ])

# Negative widths are replaced by their absolute value.
negative_width_rows <- which(dirty_iris$Sepal.Width < 0)
dirty_iris$Sepal.Width[negative_width_rows] <-
  abs(dirty_iris$Sepal.Width[negative_width_rows])

# Widths of exactly zero are replaced by NA.
zero_width_rows <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_width_rows] <- NA

print("Corrected Observations:")
print(dirty_iris[violating_indices, ])

Question Seven

pacman::p_load(dplyr, Hmisc)
also installing the dependencies ‘checkmate’, ‘gridExtra’, ‘htmlTable’, ‘viridis’, ‘Formula’

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/checkmate_2.3.2.tgz'
Content type 'application/x-gzip' length 778249 bytes (760 KB)
==================================================
downloaded 760 KB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/gridExtra_2.3.tgz'
Content type 'application/x-gzip' length 1105951 bytes (1.1 MB)
==================================================
downloaded 1.1 MB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/htmlTable_2.4.3.tgz'
Content type 'application/x-gzip' length 422165 bytes (412 KB)
==================================================
downloaded 412 KB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/viridis_0.6.5.tgz'
Content type 'application/x-gzip' length 3016921 bytes (2.9 MB)
==================================================
downloaded 2.9 MB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/Formula_1.2-5.tgz'
Content type 'application/x-gzip' length 158398 bytes (154 KB)
==================================================
downloaded 154 KB

trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/Hmisc_5.2-2.tgz'
Content type 'application/x-gzip' length 3603982 bytes (3.4 MB)
==================================================
downloaded 3.4 MB

The downloaded binary packages are in
    /var/folders/1l/8hyc8g4x08lgfhk57drrsq6h0000gn/T//Rtmpsd6XKD/downloaded_packages

Hmisc installed
# Impute the remaining missing values, using a different method per column.

# Sepal.Width: replace NA with the mean of the observed values.
dirty_iris$Sepal.Width[is.na(dirty_iris$Sepal.Width)] <-
  mean(dirty_iris$Sepal.Width, na.rm = TRUE)

# Petal.Length: replace NA with the median of the observed values.
dirty_iris$Petal.Length[is.na(dirty_iris$Petal.Length)] <-
  median(dirty_iris$Petal.Length, na.rm = TRUE)

# Petal.Width: k-nearest-neighbour imputation, written in base R.
# Hmisc::impute() takes `fun`, not `method`, so the previous call with
# method = "knn" silently fell back to median imputation.
# Neighbours are found by Euclidean distance on Sepal.Width and Petal.Length,
# the two columns already imputed above. The features are used unscaled.
# NOTE(review): k = 5 and the median of the neighbours are choices made here;
# adjust if the assignment prescribes others.
knn_k <- 5L
knn_features <- as.matrix(dirty_iris[c("Sepal.Width", "Petal.Length")])
donor_rows <- which(!is.na(dirty_iris$Petal.Width))
target_rows <- which(is.na(dirty_iris$Petal.Width))
if (length(donor_rows) > 0) {
  dirty_iris$Petal.Width[target_rows] <- vapply(
    target_rows,
    function(i) {
      # Distance from row i to every row with an observed Petal.Width.
      offsets <- sweep(knn_features[donor_rows, , drop = FALSE], 2, knn_features[i, ])
      distances <- sqrt(rowSums(offsets^2))
      n_neighbours <- min(knn_k, length(donor_rows))
      nearest <- donor_rows[order(distances)[seq_len(n_neighbours)]]
      median(dirty_iris$Petal.Width[nearest])
    },
    numeric(1)
  )
}

# Sepal.Length: predict missing values from a linear regression on the other
# three measurements. This runs after the Petal.Width imputation because
# predict() returns NA for any row that still has a missing predictor.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = dirty_iris)
missing_sepal_length <- is.na(dirty_iris$Sepal.Length)
dirty_iris$Sepal.Length[missing_sepal_length] <-
  predict(lm_model, newdata = dirty_iris[missing_sepal_length, ])

head(dirty_iris)
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA1IgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCiMjIyBPbGl2aWEgVGhvbXBzb24KIyMjIDIvMjgvMjUgCgoKIyBRdWVzdGlvbiBPbmUKYGBge3J9CmRpcnR5X2lyaXMgPC0gcmVhZC5jc3YoImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9lZHdpbmRqL2RhdGFjbGVhbmluZy9tYXN0ZXIvZGF0YS9kaXJ0eV9pcmlzLmNzdiIpCgptaXNzaW5nX3ZhbHVlcyA8LSBzdW0oaXMubmEoZGlydHlfaXJpcyRQZXRhbC5MZW5ndGgpKQptaXNzaW5nX3ZhbHVlcwpgYGAKCiMgUXVlc3Rpb24gVHdvIApgYGB7cn0KY29tcGxldGVfY2FzZXMgPC0gY29tcGxldGUuY2FzZXMoZGlydHlfaXJpcykKY29tcGxldGVfY2FzZXMKbnVtX2NvbXBsZXRlX2Nhc2VzIDwtIHN1bShjb21wbGV0ZV9jYXNlcykKcGVyY2VudGFnZV9jb21wbGV0ZV9jYXNlcyA8LSAobnVtX2NvbXBsZXRlX2Nhc2VzIC8gbnJvdyhkaXJ0eV9pcmlzKSkgKiAxMDAKbnVtX2NvbXBsZXRlX2Nhc2VzCnBlcmNlbnRhZ2VfY29tcGxldGVfY2FzZXMKYGBgCiMgUXVlc3Rpb24gVGhyZWUKYGBge3J9CgpudW1lcmljX2NvbHVtbnMgPC0gZGlydHlfaXJpc1tzYXBwbHkoZGlydHlfaXJpcywgaXMubnVtZXJpYyldCm5lZ2F0aXZlX2luZl92YWx1ZXMgPC0gc2FwcGx5KG51bWVyaWNfY29sdW1ucywgZnVuY3Rpb24oeCkgc3VtKGlzLmluZmluaXRlKHgpICYgeCA8IDApKQpuZWdhdGl2ZV9pbmZfdmFsdWVzCmBgYAoKYGBge3J9Cm51bWVyaWNfY29sdW1ucyA8LSBkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0KcG9zaXRpdmVfaW5mX3ZhbHVlcyA8LSBzYXBwbHkobnVtZXJpY19jb2x1bW5zLCBmdW5jdGlvbih4KSBzdW0oaXMuaW5maW5pdGUoeCkgJiB4ID4gMCkpCnBvc2l0aXZlX2luZl92YWx1ZXMKCmBgYAojIFF1ZXN0aW9uIEZvdXIKYGBge3J9Cm51bWVyaWNfY29sdW1ucyA8LSBkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0KaW5mX3Jvd3MgPC0gc2FwcGx5KG51bWVyaWNfY29sdW1ucywgZnVuY3Rpb24oeCkgd2hpY2goaXMuaW5maW5pdGUoeCkpKQppbmZfcm93cwpgYGAKYGBge3J9CmRpcnR5X2lyaXNbc2FwcGx5KGRpcnR5X2lyaXMsIGlzLm51bWVyaWMpXSA8LSBhcHBseShkaXJ0eV9pcmlzW3NhcHBseShkaXJ0eV9pcmlzLCBpcy5udW1lcmljKV0sIDIsIGZ1bmN0aW9uKHgpIHsKICB4W2lzLmluZmluaXRlKHgpXSA8LSBOQSAgCiAgcmV0dXJuKHgpCn0pCmRpcnR5X2lyaXNbODYsIF0KYGBgCgojIFF1ZXN0aW9uIEZpdmUgCmBgYHtyfQp2aW9sYXRpbmdfcm93cyA8LSBkaXJ0eV9pcmlzW2RpcnR5X2lyaXMkU2VwYWwuV2lkdGggPD0gMCB8IGRpcnR5X2lyaXMkU2VwYWwuTGVuZ3RoID4gMzAsXQpudW1fdmlvbGF0aW9ucyA8LSBucm93KHZpb2xhdGluZ19yb3dzKQpwcmludChwYXN0ZSgiTnVtYmVyIG9mIG9ic2VydmF0aW9ucyB2aW9sYXRpbmcgdGhlIHJ1bGVzOiIsIG51bV92aW9sYXRp
b25zKSkKcHJpbnQoIlZpb2xhdGluZyBvYnNlcnZhdGlvbnM6IikKcHJpbnQodmlvbGF0aW5nX3Jvd3MpCmBgYAoKYGBge3J9CnZpb2xhdGluZ19zZXBhbF93aWR0aCA8LSB3aGljaChkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDw9IDApCnZpb2xhdGluZ19zZXBhbF9sZW5ndGggPC0gd2hpY2goZGlydHlfaXJpcyRTZXBhbC5MZW5ndGggPiAzMCkKdmlvbGF0aW5nX2luZGljZXMgPC0gdW5pcXVlKGModmlvbGF0aW5nX3NlcGFsX3dpZHRoLCB2aW9sYXRpbmdfc2VwYWxfbGVuZ3RoKSkKdmlvbGF0aW5nX3Jvd3MgPC0gZGlydHlfaXJpc1t2aW9sYXRpbmdfaW5kaWNlcywgXQpuX3Zpb2xhdGlvbnMgPC0gbGVuZ3RoKHZpb2xhdGluZ19pbmRpY2VzKQpuX3Zpb2xhdGlvbnMKYGBgCgojIFF1ZXN0aW9uIFNpeCAKYGBge3J9CnZpb2xhdGluZ19pbmRpY2VzIDwtIHdoaWNoKCFpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLldpZHRoKSAmIGRpcnR5X2lyaXMkU2VwYWwuV2lkdGggPD0gMCkKCnByaW50KCJWaW9sYXRpbmcgT2JzZXJ2YXRpb25zIEJlZm9yZSBDb3JyZWN0aW9uOiIpCnByaW50KGRpcnR5X2lyaXNbdmlvbGF0aW5nX2luZGljZXMsIF0pCgoKZGlydHlfaXJpcyRTZXBhbC5XaWR0aFshaXMubmEoZGlydHlfaXJpcyRTZXBhbC5XaWR0aCkgJiBkaXJ0eV9pcmlzJFNlcGFsLldpZHRoIDwgMF0gPC0gCiAgYWJzKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbIWlzLm5hKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGgpICYgZGlydHlfaXJpcyRTZXBhbC5XaWR0aCA8IDBdKQoKCmRpcnR5X2lyaXMkU2VwYWwuV2lkdGhbIWlzLm5hKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGgpICYgZGlydHlfaXJpcyRTZXBhbC5XaWR0aCA9PSAwXSA8LSBOQQoKcHJpbnQoIkNvcnJlY3RlZCBPYnNlcnZhdGlvbnM6IikKcHJpbnQoZGlydHlfaXJpc1t2aW9sYXRpbmdfaW5kaWNlcywgXSkKYGBgCiMgUXVlc3Rpb24gU2V2ZW4gCmBgYHtyfQpwYWNtYW46OnBfbG9hZChkcGx5ciwgSG1pc2MpCgpkaXJ0eV9pcmlzJFNlcGFsLldpZHRoW2lzLm5hKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGgpXSA8LSBtZWFuKGRpcnR5X2lyaXMkU2VwYWwuV2lkdGgsIG5hLnJtID0gVFJVRSkKCmRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoW2lzLm5hKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoKV0gPC0gbWVkaWFuKGRpcnR5X2lyaXMkUGV0YWwuTGVuZ3RoLCBuYS5ybSA9IFRSVUUpCgpsbV9tb2RlbCA8LSBsbShTZXBhbC5MZW5ndGggfiBTZXBhbC5XaWR0aCArIFBldGFsLkxlbmd0aCArIFBldGFsLldpZHRoLCBkYXRhID0gZGlydHlfaXJpcykKZGlydHlfaXJpcyRTZXBhbC5MZW5ndGhbaXMubmEoZGlydHlfaXJpcyRTZXBhbC5MZW5ndGgpXSA8LSBwcmVkaWN0KGxtX21vZGVsLCBuZXdkYXRhID0gZGlydHlfaXJpc1tpcy5uYShkaXJ0eV9pcmlzJFNlcGFsLkxlbmd0aCksIF0pCgpkaXJ0eV9pcmlzJFBldGFsLldpZHRoIDwtIGltcHV0ZShkaXJ0eV9pcmlzJFBldGFsLldpZHRoLCBtZXRob2QgPSAia25uIikKCmhl
YWQoZGlydHlfaXJpcykKYGBgCgoK