Dirty data

# Attach DT first so datatable() is available for the display step.
library(DT)

# Read the remote CSV into the data frame `a`.
a <- read.csv(
  file = 'https://stats.dip.jp/01_ds/data/data_cleansing2_dirty_data_exercise.csv'
)

# Display `a` with DT's datatable().
datatable(a)

Check

# Work on a matrix copy of `a` so whole-table comparisons are elementwise.
a1 <- as.matrix(x = a)

# Box plot of the matrix for a first visual scan.
boxplot(x = a1)

# Does the matrix contain any negative cell?
any(0 > a1)
## [1] TRUE
# Does any cell exceed, in magnitude, three times the standard deviation
# computed over all cells together?
any(3 * sd(a1) < abs(a1))
## [1] TRUE

Range

1

# First column range to inspect: columns 25 to 34, drawn as a box plot.
r1 <- 25:34
boxplot(a1[, r1])

# Column 27 is singled out and drawn as one bar per row of a1.
r1e <- 27
y1 <- a1[, r1e]
# Bars are labelled with their row number. seq_len() is used instead of
# 1:nrow(a1) so an empty matrix yields no labels rather than c(1, 0).
barplot(height = y1, names.arg = seq_len(nrow(a1)))

identify1

# Zoom in on rows 525 to 605 of column r1e.
i1 <- 525:605
yy1 <- a1[i1, r1e]

# Each bar is labelled with its row number and, on a second line, its value
# in parentheses; the labels are shrunk so they fit under the bars.
barplot(
  height = yy1,
  names.arg = paste(i1, '\n(', yy1, ')'),
  cex.names = 0.4
)

2

# Second column range to inspect: columns 79 to 88, drawn as a box plot.
r2 <- 79:88
boxplot(a1[, r2])

# Column 83 is singled out and drawn as one bar per row of a1.
r2e <- 83
y2 <- a1[, r2e]
# Bars are labelled with their row number. seq_len() is used instead of
# 1:nrow(a1) so an empty matrix yields no labels rather than c(1, 0).
barplot(height = y2, names.arg = seq_len(nrow(a1)))

identify2

# Zoom in on rows 605 to 685 of column r2e.
i2 <- 605:685
yy2 <- a1[i2, r2e]

# Each bar is labelled with its row number and, on a second line, its value
# in parentheses; the labels are shrunk so they fit under the bars.
barplot(
  height = yy2,
  names.arg = paste(i2, '\n(', yy2, ')'),
  cex.names = 0.4
)

Direct

# Negative cells of the matrix, listed in column-major order.
a1[a1 < 0]
## [1] -42.00  -0.92  -0.55  -0.03
# Standard deviation taken over every cell of the matrix at once.
sigma <- sd(a1)
# Cells whose magnitude exceeds four times that standard deviation.
a1[abs(a1) > 4 * sigma]
## [1] -42  22 111
# Locate every suspicious cell (negative, or larger than 4 * sigma) in a
# single vectorised pass instead of a nested loop. which() scans the matrix
# column by column, so the hits come out ordered by column and then by row.
# Cells for which the condition is NA are skipped rather than aborting the run.
flagged <- which(a1 < 0 | a1 > 4 * sigma, arr.ind = TRUE, useNames = FALSE)
flagged_value <- a1[flagged]
# Report each hit on its own line: row index, column index, offending value.
for (k in seq_len(nrow(flagged))) {
  cat('Row:', flagged[k, 1], ',Col:', flagged[k, 2],
      ',Value:', flagged_value[k], fill = TRUE)
}
## Row: 2 ,Col: 1 ,Value: -42
## Row: 42 ,Col: 1 ,Value: 22
## Row: 99 ,Col: 1 ,Value: 111
## Row: 531 ,Col: 27 ,Value: -0.92
## Row: 675 ,Col: 83 ,Value: -0.55
## Row: 124 ,Col: 85 ,Value: -0.03
# Collect the hits as a Row/Col/Value matrix. It is built in one step, so res
# is a well-formed 0 x 3 matrix (not NULL) when nothing is flagged.
res <- matrix(
  c(flagged, flagged_value),
  ncol = 3,
  dimnames = list(NULL, c('Row', 'Col', 'Value'))
)
res
##      Row Col  Value
## [1,]   2   1 -42.00
## [2,]  42   1  22.00
## [3,]  99   1 111.00
## [4,] 531  27  -0.92
## [5,] 675  83  -0.55
## [6,] 124  85  -0.03