1. Создание датасета с NA
# Build a numeric vector in which two of the seven entries are missing
my_data <- c(1, 2, NA_real_, 3, NA_real_, 4, 5)
my_data
## [1] 1 2 NA 3 NA 4 5
2. Очистка данных
# Keep only the observed (non-missing) entries of my_data
clean_data <- my_data[complete.cases(my_data)]
clean_data
## [1] 1 2 3 4 5
3. Таблица данных с числовыми и текстовыми столбцами
# Assemble a five-row table with one character column and one numeric
# column; the third row is missing in both columns
names <- c("Vlad", "Anya", NA_character_, "Vitalya", "Ira")
ages <- c(23, 16, NA_real_, 7, 20)
df <- data.frame(Name = names, Age = ages)
df
## Name Age
## 1 Vlad 23
## 2 Anya 16
## 3 <NA> NA
## 4 Vitalya 7
## 5 Ira 20
# Drop every row that has a missing value in any column
clean_df <- subset(df, subset = complete.cases(df))
clean_df
## Name Age
## 1 Vlad 23
## 2 Anya 16
## 4 Vitalya 7
## 5 Ira 20
4. Функция preProcess
library(caret)
# Load the built-in airquality data set and print its per-column summary
data("airquality", package = "datasets")
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
##
# Median imputation: fit the pre-processing step on airquality, then apply
# it to the same data and summarise the filled-in result
preProc_median <- caret::preProcess(airquality, method = "medianImpute")
filled_data <- predict(preProc_median, newdata = airquality)
summary(filled_data)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 21.00 1st Qu.:120.0 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 39.56 Mean :186.8 Mean : 9.958 Mean :77.88
## 3rd Qu.: 46.00 3rd Qu.:256.0 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
5. Функция boxplot
# Two sample vectors: set1 ends with a value far above the rest,
# set2 is roughly evenly spaced
set1 <- c(10, 20, 35, 45, 500)
set2 <- c(600, 710, 810, 920, 1020)
# Draw one box plot per vector
boxplot(set1, main = "Boxplot Set 1")

boxplot(set2, main = "Boxplot Set 2")

#' Drop values lying outside the Tukey fences of a numeric vector
#'
#' The fences are `Q1 - coef * IQR` and `Q3 + coef * IQR`, where `Q1` and
#' `Q3` are the 25% and 75% sample quantiles of `x` (missing values are
#' ignored when computing them). Values exactly on a fence are kept.
#'
#' @param x A numeric vector.
#' @param coef Non-negative multiplier applied to the interquartile range
#'   when placing the fences. Defaults to 1.5, the value that used to be
#'   hard-coded.
#' @return The elements of `x` that lie within the fences, in their original
#'   order. Missing values in `x` are not dropped: they come back as `NA`,
#'   because their comparison against the fences is `NA`.
remove_outliers <- function(x, coef = 1.5) {
  stopifnot(is.numeric(coef), length(coef) == 1L, !is.na(coef), coef >= 0)
  # names = FALSE keeps the "25%"/"75%" labels out of the arithmetic below
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Q3 - Q1 is the interquartile range; reuse the quantiles computed above
  fence_width <- coef * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - fence_width
  upper <- quartiles[2] + fence_width
  x[x >= lower & x <= upper]
}
# Apply the outlier filter to both sample vectors and show the results
clean_set1 <- remove_outliers(x = set1)
clean_set2 <- remove_outliers(x = set2)
clean_set1
## [1] 10 20 35 45
clean_set2
## [1] 600 710 810 920 1020
6. Функции unique(), duplicated()
# Build a table in which the rows for IDs 2 and 4 each appear twice
df_dup <- data.frame(
  ID = rep(c(1, 2, 3, 4), times = c(1, 2, 1, 2)),
  Name = rep(c("Anya", "Vlad", "Vitalya", "Ira"), times = c(1, 2, 1, 2))
)
df_dup
## ID Name
## 1 1 Anya
## 2 2 Vlad
## 3 2 Vlad
## 4 3 Vitalya
## 5 4 Ira
## 6 4 Ira
# Two ways of dropping repeated rows: unique() directly, or keeping only
# the rows that duplicated() does not flag
unique_df <- unique(df_dup)
no_dup_df <- subset(df_dup, subset = !duplicated(df_dup))
unique_df
## ID Name
## 1 1 Anya
## 2 2 Vlad
## 4 3 Vitalya
## 5 4 Ira
no_dup_df
## ID Name
## 1 1 Anya
## 2 2 Vlad
## 4 3 Vitalya
## 5 4 Ira
7. Пакет mice
library(mice)
# Impute the missing Ozone and Solar.R values with method "pmm", producing a
# single completed data set (m = 1).
# A fixed seed is passed because the imputation draws random numbers; without
# it every run of the report yields different imputed values.
imp <- mice(airquality, method = "pmm", m = 1, seed = 123)
##
## iter imp variable
## 1 1 Ozone Solar.R
## 2 1 Ozone Solar.R
## 3 1 Ozone Solar.R
## 4 1 Ozone Solar.R
## 5 1 Ozone Solar.R
# Extract the completed data set and summarise it.
# NOTE: the summary recorded below comes from an earlier unseeded run, so its
# Ozone and Solar.R figures will differ slightly after the report is re-run.
complete_data <- complete(imp)
summary(complete_data)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:118.0 1st Qu.: 7.400 1st Qu.:72.00
## Median : 34.00 Median :203.0 Median : 9.700 Median :79.00
## Mean : 44.13 Mean :186.5 Mean : 9.958 Mean :77.88
## 3rd Qu.: 66.00 3rd Qu.:258.0 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
8. Мультиколлинеарность
library(car)
# Simulate two almost collinear predictors: x2 is 0.9 * x1 plus small noise,
# while the response y is generated from x1 only
set.seed(123)
x1 <- rnorm(n = 100)
x2 <- 0.9 * x1 + rnorm(n = 100, sd = 0.1)
y <- 2 * x1 + 5 + rnorm(n = 100)
# Regress y on both predictors and print the fit
model <- lm(formula = y ~ x1 + x2)
summary(model)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8730 -0.6607 -0.1245 0.6214 2.0798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.13507 0.09614 53.412 <2e-16 ***
## x1 1.65253 0.89193 1.853 0.067 .
## x2 0.23811 0.98995 0.241 0.810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9513 on 97 degrees of freedom
## Multiple R-squared: 0.7659, Adjusted R-squared: 0.761
## F-statistic: 158.7 on 2 and 97 DF, p-value: < 2.2e-16
# Multicollinearity check: variance inflation factor of each predictor
car::vif(model)
## x1 x2
## 72.51521 72.51521