Assignment5INT

# Exam marks; three entries are missing (NA).
marks <- c(89, 95, 85, NA, 76, 69, 78, NA, 73, NA, 90, 69)

# Build two filled-in copies, leaving `marks` itself untouched:
# marks1 gets the mean of the observed marks, marks2 gets their median.
marks1 <- replace(marks, is.na(marks), mean(marks, na.rm = TRUE))
marks2 <- replace(marks, is.na(marks), median(marks, na.rm = TRUE))

marks1

##  [1] 89.00000 95.00000 85.00000 80.44444 76.00000 69.00000 78.00000 80.44444
##  [9] 73.00000 80.44444 90.00000 69.00000

marks2

##  [1] 89 95 85 78 76 69 78 78 73 78 90 69

pacman:: p_load(Hmisc, VIM)

# Repeat the mean/median fill with Hmisc's impute(); the result marks the
# filled-in entries (shown with a trailing * when printed).
marks1 <- Hmisc::impute(marks, fun = mean)
marks2 <- Hmisc::impute(marks, fun = median)

marks1

##         1         2         3         4         5         6         7         8 
##  89.00000  95.00000  85.00000 80.44444*  76.00000  69.00000  78.00000 80.44444* 
##         9        10        11        12 
##  73.00000 80.44444*  90.00000  69.00000

# TRUE at the positions that were imputed.
Hmisc::is.imputed(marks1)

##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE

# Ratio imputation: use last year's marks (complete) to fill this year's gaps.
marks <- c(89, 95, 85, NA, 76, 69, 78, NA, 73, NA, 90, 69)
old_marks <- c(90, 100, 75, 89, 80, 77, 88, 69, 75, 88, 95, 70)

# Total of the old marks, restricted to students who have a current mark.
sum(old_marks[complete.cases(marks)])

## [1] 750

# Total of the current marks that are present.
sum(na.omit(marks))

## [1] 724

# R is old-over-new on the jointly observed students, so a missing current
# mark is estimated as old mark divided by R.
R <- sum(old_marks[complete.cases(marks)]) / sum(na.omit(marks))
R

## [1] 1.035912

marks <- ifelse(is.na(marks), old_marks / R, marks)
marks

##  [1] 89.00000 95.00000 85.00000 85.91467 76.00000 69.00000 78.00000 66.60800
##  [9] 73.00000 84.94933 90.00000 69.00000

# Start from a fresh copy of the built-in iris data.
data(iris)
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Blank out the first ten sepal lengths so there is something to impute.
iris[1:10, "Sepal.Length"] <- NA
head(iris)

# Regress sepal length on sepal width and petal length. Rows whose response
# is NA are left out of the fit under R's default na.action.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data=iris)
model

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
## 
## Coefficients:
##  (Intercept)   Sepal.Width  Petal.Length  
##       2.2716        0.5931        0.4689

# Fill each missing sepal length with the model's prediction for that row.
I <- is.na(iris$Sepal.Length)
iris[I, "Sepal.Length"] <- predict(model, newdata = iris[I, ])
head(iris, n = 10)

# Knock out 10 randomly chosen rows in every column of iris.
# seq_along() / sample.int() replace the 1:ncol() and 1:nrow() forms, which
# count backwards (1, 0) when the data frame is empty.
# No seed is set in this script before this point, so the rows chosen (and the
# summary recorded below) differ from run to run.
for (i in seq_along(iris)) {
  iris[sample.int(nrow(iris), 10), i] <- NA
}
summary(iris)

##   Sepal.Length   Sepal.Width     Petal.Length    Petal.Width          Species  
##  Min.   :4.30   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :46  
##  1st Qu.:5.10   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:48  
##  Median :5.70   Median :3.000   Median :4.300   Median :1.300   virginica :46  
##  Mean   :5.82   Mean   :3.059   Mean   :3.744   Mean   :1.181   NA's      :10  
##  3rd Qu.:6.40   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.90   Max.   :4.400   Max.   :6.900   Max.   :2.500                  
##  NA's   :10     NA's   :10      NA's   :10      NA's   :10

# k-nearest-neighbour imputation of every column with VIM's kNN().
iris2 <- VIM::kNN(iris)
summary(iris2)

##   Sepal.Length   Sepal.Width     Petal.Length    Petal.Width          Species  
##  Min.   :4.30   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :50  
##  1st Qu.:5.10   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:49  
##  Median :5.80   Median :3.000   Median :4.300   Median :1.300   virginica :51  
##  Mean   :5.84   Mean   :3.057   Mean   :3.757   Mean   :1.207                  
##  3rd Qu.:6.40   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.90   Max.   :4.400   Max.   :6.900   Max.   :2.500                  
##  Sepal.Length_imp Sepal.Width_imp Petal.Length_imp Petal.Width_imp
##  Mode :logical    Mode :logical   Mode :logical    Mode :logical  
##  FALSE:140        FALSE:140       FALSE:140        FALSE:140      
##  TRUE :10         TRUE :10        TRUE :10         TRUE :10       
##                                                                   
##                                                                   
##                                                                   
##  Species_imp    
##  Mode :logical  
##  FALSE:140      
##  TRUE :10       
##                 
##                 
##

# The result carries the five original columns followed by five logical
# *_imp indicator columns (see the summary above). First drop the indicators,
# then keep only the four measurement columns -- note this also drops Species.
iris2 <- iris2[, seq_len(5)]
iris2 <- iris2[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]

summary(iris2)

##   Sepal.Length   Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.30   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.10   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.80   Median :3.000   Median :4.300   Median :1.300  
##  Mean   :5.84   Mean   :3.057   Mean   :3.757   Mean   :1.207  
##  3rd Qu.:6.40   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.90   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Download the deliberately messy iris data set used for the cleaning exercise.
dirty_iris <- read.csv(
  file = "https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv"
)

# How many petal lengths are missing?
missing_petal_length <- sum(is.na(dirty_iris[["Petal.Length"]]))
cat("Number of missing values in Petal.Length:", missing_petal_length, "\n")

## Number of missing values in Petal.Length: 19

# Rows without a single NA in any column.
complete_obs <- nrow(na.omit(dirty_iris))
cat("Number of complete observations:", complete_obs, "\n")

## Number of complete observations: 96

total_obs <- dim(dirty_iris)[1]

# Share of complete rows, expressed as a percentage.
percent_complete <- 100 * (complete_obs / total_obs)
cat("Percentage of complete observations:", percent_complete, "%\n")

## Percentage of complete observations: 64 %

numeric_cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

# Print how many NA, Inf and NaN entries a vector holds.
#
# column: the vector to inspect.
# name:   label printed on the "Column:" line. It defaults to the deparsed
#         call expression, which is only informative when the caller passes a
#         plain variable; pass the label explicitly when the vector is
#         extracted with an expression such as df[[col]].
#
# Called for its printed output only.
check_special_values <- function(column, name = deparse(substitute(column))) {
  # Resolve the default label up front, before `column` is used.
  force(name)
  # is.na() is also TRUE for NaN, so the NA count includes any NaN entries.
  na_count <- sum(is.na(column))
  # is.infinite() and is.nan() never return NA, so no na.rm is needed.
  inf_count <- sum(is.infinite(column))
  nan_count <- sum(is.nan(column))
  cat("Column:", name, "\n")
  cat("  NA count:", na_count, "\n")
  cat("  Inf count:", inf_count, "\n")
  cat("  NaN count:", nan_count, "\n\n")
}


# Report each numeric column under its own name rather than the literal
# expression `dirty_iris[[col]]`.
for (col in numeric_cols) {
  check_special_values(dirty_iris[[col]], name = col)
}

## Column: Sepal.Length 
##   NA count: 10 
##   Inf count: 0 
##   NaN count: 0 
## 
## Column: Sepal.Width 
##   NA count: 17 
##   Inf count: 0 
##   NaN count: 0 
## 
## Column: Petal.Length 
##   NA count: 19 
##   Inf count: 0 
##   NaN count: 0 
## 
## Column: Petal.Width 
##   NA count: 12 
##   Inf count: 1 
##   NaN count: 0

cat("Total rows with any Inf in numeric columns:\n")

## Total rows with any Inf in numeric columns:

# Test the numeric columns as one matrix, then flag every row that has at
# least one infinite entry.
inf_rows <- rowSums(is.infinite(as.matrix(dirty_iris[numeric_cols]))) > 0
cat(sum(inf_rows, na.rm = TRUE), "rows with Inf\n")

## 1 rows with Inf

cat("Total rows with any NaN in numeric columns:\n")

## Total rows with any NaN in numeric columns:

# Same row-wise check for NaN entries.
nan_rows <- rowSums(is.nan(as.matrix(dirty_iris[numeric_cols]))) > 0
cat(sum(nan_rows, na.rm = TRUE), "rows with NaN\n")

## 0 rows with NaN

# Positions of the infinite Petal.Width entries, captured before any change.
inf_row <- which(is.infinite(dirty_iris$Petal.Width))

cat("Initial Inf count in Petal.Width:", length(inf_row), "\n")

## Initial Inf count in Petal.Width: 1

cat("Initial NA count in Petal.Width:", sum(is.na(dirty_iris[["Petal.Width"]])), "\n")

## Initial NA count in Petal.Width: 12

if (length(inf_row) == 0) {
  cat("No Inf found in Petal.Width\n")
} else {
  cat("Row with Inf in Petal.Width:", inf_row, "\n")
  cat("Original value at row", inf_row, ":", dirty_iris$Petal.Width[inf_row], "\n")
}

## Row with Inf in Petal.Width: 86 
## Original value at row 86 : Inf

# Treat the infinite measurement as missing.
dirty_iris$Petal.Width[inf_row] <- NA

cat("\nAfter replacement:\n")

## 
## After replacement:

cat("Inf count in Petal.Width:", sum(is.infinite(dirty_iris[["Petal.Width"]])), "\n")

## Inf count in Petal.Width: 0

cat("NA count in Petal.Width:", sum(is.na(dirty_iris[["Petal.Width"]])), "\n")

## NA count in Petal.Width: 13

# Show the value now stored where the Inf used to be.
if (length(inf_row) != 0) {
  cat("Value at row", inf_row, "is now:", dirty_iris$Petal.Width[inf_row], "\n")
}

## Value at row 86 is now: NA

# Flag rows that break either plausibility rule:
#   Sepal.Width must be positive, and Sepal.Length must not exceed 30.
# Where a tested value is missing the comparison can yield NA rather than
# TRUE/FALSE, so `violations` may contain NA entries.
violations <- dirty_iris$Sepal.Width <= 0 | dirty_iris$Sepal.Length > 30

# which() keeps only the positions that are TRUE. Indexing with the raw
# logical vector would turn every NA entry into an all-NA row in the result.
violating_rows <- dirty_iris[which(violations), ]

num_violations <- sum(violations, na.rm = TRUE)

cat("Number of observations violating the rules:", num_violations, "\n\n")

## Number of observations violating the rules: 4

cat("Observations violating the rules:\n")

## Observations violating the rules:

print(violating_rows)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

# Find the rows whose sepal width is present but not strictly positive.
violations_width <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width <= 0
violating_rows <- dirty_iris[which(violations_width), ]

num_violations <- nrow(violating_rows)
cat("Number of observations with Sepal.Width <= 0 (excluding NA):", num_violations, "\n")

## Number of observations with Sepal.Width <= 0 (excluding NA): 2

cat("Observations violating Sepal.Width > 0 rule:\n")

## Observations violating Sepal.Width > 0 rule:

print(violating_rows)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 130          5.7           0          1.7         0.3     setosa

# Negative widths are assumed to be sign errors: flip them to positive.
# abs() leaves the non-negative and missing entries as they are.
dirty_iris$Sepal.Width <- abs(dirty_iris$Sepal.Width)
# A width of exactly zero is treated as missing.
dirty_iris$Sepal.Width[which(dirty_iris$Sepal.Width == 0)] <- NA

cat("\nAfter corrections:\n")

## 
## After corrections:

cat("Number of Sepal.Width <= 0 (excluding NA):", sum(dirty_iris$Sepal.Width <= 0, na.rm = TRUE), "\n")

## Number of Sepal.Width <= 0 (excluding NA): 0

cat("Updated rows that originally violated the rule:\n")

## Updated rows that originally violated the rule:

print(dirty_iris[violations_width, ])

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0           3          3.5         1.0 versicolor
## 130          5.7          NA          1.7         0.3     setosa

cat("Updated NA count in Sepal.Width:", sum(is.na(dirty_iris[["Sepal.Width"]])), "\n")

## Updated NA count in Sepal.Width: 18

cat("Before imputation - NA counts:\n")

## Before imputation - NA counts:

# Missing values per column.
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           10           18           19           13            0

# Sepal.Width: fill the gaps with the mean of the observed widths.
# NOTE(review): the mean is taken over every observed width, including the
# rows flagged earlier for Sepal.Length > 30 (widths 29 and 30) -- confirm
# that this is intended.
sepal_width_mean <- mean(dirty_iris$Sepal.Width, na.rm = TRUE)
dirty_iris$Sepal.Width <- replace(
  dirty_iris$Sepal.Width,
  is.na(dirty_iris$Sepal.Width),
  sepal_width_mean
)
cat("\nSepal.Width imputed with mean:", sepal_width_mean, "\n")

## 
## Sepal.Width imputed with mean: 3.462121

# Petal.Length: fill the gaps with the median of the observed lengths.
petal_length_median <- median(dirty_iris$Petal.Length, na.rm = TRUE)
dirty_iris$Petal.Length <- replace(
  dirty_iris$Petal.Length,
  is.na(dirty_iris$Petal.Length),
  petal_length_median
)
cat("Petal.Length imputed with median:", petal_length_median, "\n")

## Petal.Length imputed with median: 4.5

# Sepal.Length: fit a linear model on the rows where it is observed, then use
# the model's predictions wherever it is missing.
# NOTE(review): Petal.Width still contains NA at this point, so a row that
# lacks both Sepal.Length and Petal.Width would presumably get an NA
# prediction and stay missing -- confirm no such row exists.
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
               data = dirty_iris, subset = !is.na(Sepal.Length))
predicted_sepal_length <- predict(lm_model, newdata = dirty_iris)
dirty_iris$Sepal.Length <- ifelse(
  is.na(dirty_iris$Sepal.Length),
  predicted_sepal_length,
  dirty_iris$Sepal.Length
)
cat("Sepal.Length imputed with linear regression\n")

## Sepal.Length imputed with linear regression

# Petal.Width: k-nearest-neighbour imputation (5 neighbours, median of the
# neighbours for numeric values), restricted to this one variable.
dirty_iris <- VIM::kNN(
  dirty_iris,
  variable = "Petal.Width",
  k = 5,
  numFun = median,
  catFun = VIM::maxCat
)

# Keep only the original five columns; anything kNN() appended is dropped.
dirty_iris <- dirty_iris[, seq_len(5)]
cat("Petal.Width imputed with kNN (k=5)\n")

## Petal.Width imputed with kNN (k=5)

cat("\nAfter imputation - NA counts:\n")

## 
## After imputation - NA counts:

# Missing values per column after all four imputations.
print(colSums(is.na(dirty_iris)))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

cat("\nSample of imputed dataset:\n")

## 
## Sample of imputed dataset:

print(head(dirty_iris, n = 6L))

##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          6.4    3.200000          4.5         1.5 versicolor
## 2          6.3    3.300000          6.0         2.5  virginica
## 3          6.2    3.462121          5.4         2.3  virginica
## 4          5.0    3.400000          1.6         0.4     setosa
## 5          5.7    2.600000          3.5         1.0 versicolor
## 6          5.3    3.462121          4.5         0.2     setosa

Assignment5INT

Daniel Haran