1. Создание датасета с NA

# Build a numeric vector that contains two missing values
my_data <- c(1, 2, NA_real_, 3, NA_real_, 4, 5)
# Auto-print the vector so the positions of the NAs are visible
my_data
## [1]  1  2 NA  3 NA  4  5

2. Очистка данных

# Keep only the observed (non-missing) elements of my_data
clean_data <- my_data[stats::complete.cases(my_data)]
clean_data
## [1] 1 2 3 4 5

3. Таблица данных с числовыми и текстовыми столбцами

# Two parallel columns; the third record is missing in both of them
names <- c("Vlad", "Anya", NA_character_, "Vitalya", "Ira")
ages <- c(23, 16, NA_real_, 7, 20)
# Combine the columns into a data frame and display it
df <- data.frame(
  Name = names,
  Age = ages
)
df
##      Name Age
## 1    Vlad  23
## 2    Anya  16
## 3    <NA>  NA
## 4 Vitalya   7
## 5     Ira  20
# Drop every row that has a missing value in any column
clean_df <- df[stats::complete.cases(df), , drop = FALSE]
clean_df
##      Name Age
## 1    Vlad  23
## 2    Anya  16
## 4 Vitalya   7
## 5     Ira  20

4. Функция preProcess

library(caret)

# Load the built-in airquality data set and summarise each column
# (the summary also reports the number of NAs per column)
data("airquality", package = "datasets")
summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Median imputation: fit the pre-processing step on airquality, then apply
# it to the same data to fill in the missing values
preProc_median <- preProcess(x = airquality, method = "medianImpute")
filled_data <- predict(object = preProc_median, newdata = airquality)
summary(filled_data)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 21.00   1st Qu.:120.0   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 39.56   Mean   :186.8   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 46.00   3rd Qu.:256.0   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0

5. Функция boxplot

# Sample data: the last value of set1 lies far away from the others
set1 <- c(10, 20, 35, 45, 500)
set2 <- c(600, 710, 810, 920, 1020)

# Draw one box plot per sample
boxplot(set1, main = "Boxplot Set 1")

boxplot(set2, main = "Boxplot Set 2")

#' Drop values that lie outside the Tukey fences of a numeric vector.
#'
#' The fences are `Q1 - coef * IQR` and `Q3 + coef * IQR`, where Q1 and Q3 are
#' the 25th and 75th percentiles returned by `quantile()`. Values lying exactly
#' on a fence are kept.
#'
#' @param x Numeric vector. `NA`s are ignored when the fences are computed but
#'   are NOT removed from the result (an `NA` comparison indexes as `NA`).
#' @param coef Single non-negative number: the IQR multiplier that sets the
#'   fence width. Defaults to 1.5, the value that used to be hard-coded.
#' @return The elements of `x` that fall within the fences, in original order.
remove_outliers <- function(x, coef = 1.5) {
  stopifnot(is.numeric(coef), length(coef) == 1L, !is.na(coef), coef >= 0)
  qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Q3 - Q1 is exactly what IQR() returns; reuse the quantiles already computed
  # instead of running a second quantile pass over x.
  fence <- coef * (qnt[2] - qnt[1])
  x[x >= (qnt[1] - fence) & x <= (qnt[2] + fence)]
}

# Run the outlier filter on both samples and show what is left
clean_set1 <- remove_outliers(x = set1)
clean_set2 <- remove_outliers(x = set2)
clean_set1
## [1] 10 20 35 45
clean_set2
## [1]  600  710  810  920 1020

6. Функции unique(), duplicated()

# Build a table in which the second and fourth records each appear twice
df_dup <- data.frame(
  ID = rep(c(1, 2, 3, 4), times = c(1, 2, 1, 2)),
  Name = rep(c("Anya", "Vlad", "Vitalya", "Ira"), times = c(1, 2, 1, 2))
)
df_dup
##   ID    Name
## 1  1    Anya
## 2  2    Vlad
## 3  2    Vlad
## 4  3 Vitalya
## 5  4     Ira
## 6  4     Ira
# Remove repeated rows in two equivalent ways: unique() and a duplicated() mask
unique_df <- unique(x = df_dup)
no_dup_df <- df_dup[!duplicated(x = df_dup), , drop = FALSE]

unique_df
##   ID    Name
## 1  1    Anya
## 2  2    Vlad
## 4  3 Vitalya
## 5  4     Ira
no_dup_df
##   ID    Name
## 1  1    Anya
## 2  2    Vlad
## 4  3 Vitalya
## 5  4     Ira

7. Пакет mice

library(mice)

# Impute the missing values with mice: predictive mean matching ("pmm"),
# a single imputed data set (m = 1).
# pmm picks donor values at random, so the seed is fixed to make the imputed
# values reproducible between runs. Output recorded from an earlier unseeded
# run will not match and has to be regenerated.
imp <- mice(airquality, method = "pmm", m = 1, seed = 123)
## 
##  iter imp variable
##   1   1  Ozone  Solar.R
##   2   1  Ozone  Solar.R
##   3   1  Ozone  Solar.R
##   4   1  Ozone  Solar.R
##   5   1  Ozone  Solar.R
# Extract the imputed data set from the mice result and summarise it.
# The namespace prefix makes explicit that this is mice's complete().
complete_data <- mice::complete(imp)
summary(complete_data)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:118.0   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 34.00   Median :203.0   Median : 9.700   Median :79.00  
##  Mean   : 44.13   Mean   :186.5   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 66.00   3rd Qu.:258.0   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0

8. Мультиколлинеарность

library(car)

# Simulate two strongly correlated predictors; the response is generated
# from x1 only, plus standard normal noise
set.seed(123)
x1 <- rnorm(n = 100)
x2 <- 0.9 * x1 + rnorm(n = 100, sd = 0.1)
y <- 5 + 2 * x1 + rnorm(n = 100)

# Fit a linear model on both predictors and show the fit summary
model <- lm(formula = y ~ x1 + x2)
summary(model)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8730 -0.6607 -0.1245  0.6214  2.0798 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.13507    0.09614  53.412   <2e-16 ***
## x1           1.65253    0.89193   1.853    0.067 .  
## x2           0.23811    0.98995   0.241    0.810    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9513 on 97 degrees of freedom
## Multiple R-squared:  0.7659, Adjusted R-squared:  0.761 
## F-statistic: 158.7 on 2 and 97 DF,  p-value: < 2.2e-16
# Check for multicollinearity: variance inflation factor of each predictor
car::vif(model)
##       x1       x2 
## 72.51521 72.51521