1. Создание датасета с NA

# Numeric vector of ten elements with missing values (NA) at positions 4 and 6.
data <- c(1:3, NA, 5, NA, 7:10)
data
##  [1]  1  2  3 NA  5 NA  7  8  9 10

2. Очистка данных от NA

# Keep only the observed (non-NA) elements of `data`.
clean_data <- data[complete.cases(data)]
clean_data
## [1]  1  2  3  5  7  8  9 10

3. Таблица с числовыми и текстовыми столбцами

# Example table: a numeric ID, a numeric Value containing two NAs,
# and a character Category.
data_table <- data.frame(
  ID       = as.numeric(1:7),
  Value    = c(10, 20, NA, 40, 50, NA, 70),
  Category = c("A", "B", "A", "C", "B", "C", "A")
)
# Print only the rows without NA in any column (rows 3 and 6 drop out;
# the original row names are preserved).
subset(data_table, complete.cases(data_table))
##   ID Value Category
## 1  1    10        A
## 2  2    20        B
## 4  4    40        C
## 5  5    50        B
## 7  7    70        A

4. Заполнение пропусков в airquality (caret)

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(airquality)
# Fit a median-imputation preprocessing step on airquality, then apply it
# to the same data so that missing values are replaced by column medians.
preprocess <- preProcess(airquality, method = "medianImpute")
airquality_imputed <- predict(preprocess, newdata = airquality)
# The summary below has no "NA's" row for any column, i.e. no gaps remain.
summary(airquality_imputed)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 21.00   1st Qu.:120.0   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 39.56   Mean   :186.8   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 46.00   3rd Qu.:256.0   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0

5. Выбросы и их удаление

# Two simulated samples, each with one artificially appended extreme value
# (30 and 50). The fixed seed makes the random draws repeatable.
set.seed(42)
data1 <- c(rnorm(50, mean = 10, sd = 2), 30)
data2 <- c(rnorm(50, mean = 20, sd = 3), 50)
boxplot(data1, main = "Выбросы в data1")

# Remove exactly the points that a box plot marks as outliers, i.e. values
# lying more than `coef` times the inter-hinge range beyond the hinges.
#
# A fixed quantile cut-off (e.g. keeping values below the 95th percentile)
# always discards the top ~5% of the data, even when they are not outliers,
# and never inspects the lower tail; the box-plot rule avoids both problems.
#
# x     Numeric vector.
# coef  Whisker length multiplier passed to boxplot.stats() (default 1.5).
# Returns `x` without the flagged values; NA elements are left in place.
remove_outliers <- function(x, coef = 1.5) {
  stopifnot(is.numeric(x))
  flagged <- boxplot.stats(x, coef = coef)$out
  x[!x %in% flagged]
}

data1_clean <- remove_outliers(data1)
# data2 carries an injected outlier as well, so it is cleaned the same way.
data2_clean <- remove_outliers(data2)

6. Удаление дубликатов

# Table in which two rows are exact repeats (ID 2 and ID 4 each occur twice).
data_dup <- data.frame(
  ID    = c(1, 2, 2, 3, 4, 4, 5),
  Value = c(10, 20, 20, 30, 40, 40, 50)
)
# Variant 1: unique() keeps the first occurrence of every distinct row.
unique(data_dup)
##   ID Value
## 1  1    10
## 2  2    20
## 4  3    30
## 5  4    40
## 7  5    50
# Variant 2: the same rows selected through a mask of non-duplicated rows.
subset(data_dup, !duplicated(data_dup))
##   ID Value
## 1  1    10
## 2  2    20
## 4  3    30
## 5  4    40
## 7  5    50

7. Обработка пропусков (mice)

library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Multiple imputation of airquality: m = 5 imputed data sets using
# predictive mean matching ("pmm"). The iteration log below lists only
# Ozone and Solar.R as imputed variables.
# NOTE(review): no `seed` is passed to mice(), so the imputed values
# presumably depend on the RNG state at this point -- consider passing
# `seed =` explicitly if the numbers below must be reproducible.
imp <- mice(data = airquality, m = 5, method = "pmm")
## 
##  iter imp variable
##   1   1  Ozone  Solar.R
##   1   2  Ozone  Solar.R
##   1   3  Ozone  Solar.R
##   1   4  Ozone  Solar.R
##   1   5  Ozone  Solar.R
##   2   1  Ozone  Solar.R
##   2   2  Ozone  Solar.R
##   2   3  Ozone  Solar.R
##   2   4  Ozone  Solar.R
##   2   5  Ozone  Solar.R
##   3   1  Ozone  Solar.R
##   3   2  Ozone  Solar.R
##   3   3  Ozone  Solar.R
##   3   4  Ozone  Solar.R
##   3   5  Ozone  Solar.R
##   4   1  Ozone  Solar.R
##   4   2  Ozone  Solar.R
##   4   3  Ozone  Solar.R
##   4   4  Ozone  Solar.R
##   4   5  Ozone  Solar.R
##   5   1  Ozone  Solar.R
##   5   2  Ozone  Solar.R
##   5   3  Ozone  Solar.R
##   5   4  Ozone  Solar.R
##   5   5  Ozone  Solar.R
# Extract the first of the five completed data sets (action = 1 is the
# default of complete()); the other four imputations are not used here.
airquality_complete <- complete(imp, action = 1L)
summary(airquality_complete)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.0   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 34.00   Median :203.0   Median : 9.700   Median :79.00  
##  Mean   : 42.83   Mean   :184.7   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.00   3rd Qu.:258.0   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0

8. Мультиколлинеарность

# Install `car` only when it is missing, so that re-running the script does
# not reinstall the package (and need network access) every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
## Loading required package: carData
# Three predictors drawn independently of each other.
data_multicol <- data.frame(
  X1 = rnorm(100, 10, 3),
  X2 = rnorm(100, 20, 5),
  X3 = rnorm(100, 30, 7)
)
# X4 is almost an exact linear function of X1 (2 * X1 plus small noise),
# which is the source of multicollinearity in this example.
data_multicol$X4 <- data_multicol$X1 * 2 + rnorm(100, 0, 1)
# VIF describes collinearity among the PREDICTORS of a model. X4 therefore
# has to be on the right-hand side together with X1; using X4 as the
# response would only compare the independent X1, X2 and X3 and report
# VIF values close to 1. A separate response Y is simulated for the model
# (its exact form does not affect the VIF values).
data_multicol$Y <- 1 + 0.5 * data_multicol$X1 + 0.2 * data_multicol$X2 +
  0.1 * data_multicol$X3 + rnorm(100, 0, 1)
vif(lm(Y ~ X1 + X2 + X3 + X4, data = data_multicol))
# Expected pattern (exact numbers depend on the random draws): VIF for X2
# and X3 stays near 1, while VIF for X1 and X4 is large -- roughly
# 1 + var(2 * X1) / var(noise) = 1 + 36 / 1 = 37 in expectation -- far above
# the usual rule-of-thumb thresholds of 5 to 10.

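Выводы: пропущенные значения удалены (is.na, complete.cases) или заполнены (медианой с помощью caret и методом PMM с помощью mice), выбросы отфильтрованы, дублирующиеся строки удалены, а мультиколлинеарность оценена с помощью коэффициента VIF.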