# Build a synthetic "dirty" data set: 100000 uniform(0, 1) draws with a few
# invalid values mixed in, saved as a 1000 x 100 CSV table.
set.seed(1)  # fixed seed so the generated file is reproducible
n <- 100000
rv <- runif(n = n, min = 0, max = 1)

# Overwrite `size` randomly chosen positions with draws between -1 and 0.
size <- 3
ii <- sample(n, size)
rv[ii] <- runif(n = size, min = -1, max = 0)

# Plant three gross outliers at fixed positions.
rv[39] <- 22
rv[52] <- -21
rv[91] <- 11

# Round to two decimals and fill a 100-column matrix column by column.
m <- matrix(round(rv, 2), ncol = 100)

# Use TRUE/FALSE rather than T/F: the short aliases are ordinary variables
# and can be reassigned.
write.csv(as.data.frame(m),
          file = 'data_cleansing2_dirty_data.csv',
          row.names = FALSE, quote = FALSE)
# Attach DT (provides datatable()), then download the exercise data set.
library(DT)
d <- read.csv(
  file = 'https://stats.dip.jp/01_ds/data/data_cleansing2_dirty_data_exercise.csv'
)

# Matrix form of the table, used for the numeric checks and plots.
m <- as.matrix(d)

# Interactive view of the raw table, followed by one box plot per column.
datatable(d)
boxplot(m)

# Does the matrix contain any negative entry?
any(m < 0)
## [1] TRUE
# Does any entry exceed three standard deviations (taken over all cells)
# in absolute value?
any(abs(m) > 3 * sd(m))
## [1] TRUE
# Redraw the box plot for columns 36 to 46 only.
jj <- 36:46
boxplot(m[, jj])

# Inspect column 37 with bar plots over progressively narrower row windows.
j <- 37
y <- m[, j]
# All rows, labelled by row number. seq_len() is used instead of 1:nrow(m)
# because 1:0 would produce c(1, 0) for a matrix without rows.
barplot(height = y,
        names.arg = seq_len(nrow(m)))

# Rows 140 to 231; each label shows the row number and the value.
ii <- 140:231
y <- m[ii, j]
barplot(height = y,
        cex.names = 0.4,
        names.arg = paste(ii, '\n(', y, ')'))

# Rows 194 to 206, same labelling.
ii <- 194:206
y <- m[ii, j]
barplot(height = y,
        cex.names = 0.4,
        names.arg = paste(ii, '\n(', y, ')'))

# Print the entry at row 200 of the inspected column.
m[200, j]
## V37
## 0.67
# List every negative entry.
m[m < 0]
## [1] -42.00 -0.92 -0.55 -0.03
# Standard deviation over all cells of the matrix; then list the entries
# whose absolute value exceeds four times that figure.
sigma <- sd(m)
m[abs(m) > 4 * sigma]
## [1] -42 22 111
# Collect every anomalous cell (negative, or greater than 4 * sigma) into a
# Row / Col / Value table and report each one.
#
# which(arr.ind = TRUE) returns the (row, col) positions of the TRUE cells in
# column-major order, i.e. the order a column-outer / row-inner loop would
# visit them. Cells whose test result is NA are skipped rather than stopping
# the script, and the table is built in one step instead of being grown with
# rbind() one row at a time.
hits <- which(m < 0 | m > 4 * sigma, arr.ind = TRUE)

# One row per flagged cell; this is a 0 x 3 matrix when nothing is flagged,
# so assigning the column names below always works.
m.a <- cbind(unname(hits[, 1]), unname(hits[, 2]), m[hits])

for (k in seq_len(nrow(hits))) {
  # Print the integer positions from `hits` so that large indices are not
  # shown in scientific notation.
  cat('Row:', hits[k, 1], ', Col:', hits[k, 2], ', Value: ', m.a[k, 3],
      fill = TRUE)
}
## Row: 2 , Col: 1 , Value: -42
## Row: 42 , Col: 1 , Value: 22
## Row: 99 , Col: 1 , Value: 111
## Row: 531 , Col: 27 , Value: -0.92
## Row: 675 , Col: 83 , Value: -0.55
## Row: 124 , Col: 85 , Value: -0.03
colnames(m.a) <- c('Row', 'Col', 'Value')
m.a
## Row Col Value
## [1,] 2 1 -42.00
## [2,] 42 1 22.00
## [3,] 99 1 111.00
## [4,] 531 27 -0.92
## [5,] 675 83 -0.55
## [6,] 124 85 -0.03