# Numeric sample that contains two missing observations (NA).
data <- c(1, 2, 3, NA, 5, NA, 7, 8, 9, 10)
data
## [1] 1 2 3 NA 5 NA 7 8 9 10
# Keep only the observed elements: filter on the negated NA predicate.
clean_data <- Filter(Negate(is.na), data)
clean_data
## [1] 1 2 3 5 7 8 9 10
# Small example table; Value is missing in rows 3 and 6.
data_table <- data.frame(
  ID = as.numeric(1:7),
  Value = c(10, 20, NA, 40, 50, NA, 70),
  Category = c("A", "B", "A", "C", "B", "C", "A")
)
# Show only the rows with zero NA cells; the original row names are kept,
# which is why the printed index skips 3 and 6.
data_table[rowSums(is.na(data_table)) == 0, ]
## ID Value Category
## 1 1 10 A
## 2 2 20 B
## 4 4 40 C
## 5 5 50 B
## 7 7 70 A
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the built-in airquality data set into the workspace.
data("airquality")
# Fit a preprocessing recipe that fills each NA with its column median,
# then apply that recipe to the same data.
preprocess <- preProcess(airquality, method = "medianImpute")
airquality_imputed <- predict(preprocess, newdata = airquality)
# Per-column summary of the imputed data (no NA counts are reported below).
summary(airquality_imputed)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 21.00 1st Qu.:120.0 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 39.56 Mean :186.8 Mean : 9.958 Mean :77.88
## 3rd Qu.: 46.00 3rd Qu.:256.0 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
# Simulate two normal samples, each with one injected extreme value
# (30 and 50 respectively), so there is a known outlier to detect.
set.seed(42)
data1 <- c(rnorm(50, mean = 10, sd = 2), 30)
data2 <- c(rnorm(50, mean = 20, sd = 3), 50)

# Visual check: points drawn beyond the whiskers are the ones flagged by the
# boxplot's 1.5 * IQR rule.
boxplot(data1, main="Выбросы в data1")

# Remove the points that the boxplot rule flags as outliers.
#
# A fixed percentile cut (e.g. keeping values below the 95th percentile) always
# discards the top ~5% of observations, even when they are ordinary values, and
# never removes low outliers. Using boxplot.stats() keeps the cleaning step
# consistent with the plot above and works on both tails.
#
# x    : numeric vector.
# coef : whisker length as a multiple of the IQR (boxplot default is 1.5).
# Returns `x` without the flagged values; NA elements are left in place.
remove_outliers <- function(x, coef = 1.5) {
  flagged <- boxplot.stats(x, coef = coef)$out
  x[!x %in% flagged]
}

data1_clean <- remove_outliers(data1)
data2_clean <- remove_outliers(data2)
# Example table in which the rows for ID 2 and ID 4 each appear twice.
data_dup <- data.frame(
  ID = c(1, 2, 2, 3, 4, 4, 5),
  Value = c(10, 20, 20, 30, 40, 40, 50)
)
# Approach 1: unique() keeps the first occurrence of every distinct row.
unique(data_dup)
## ID Value
## 1 1 10
## 2 2 20
## 4 3 30
## 5 4 40
## 7 5 50
# Approach 2: the same result, selecting the rows that are not flagged
# as repeats of an earlier row.
subset(data_dup, !duplicated(data_dup))
## ID Value
## 1 1 10
## 2 2 20
## 4 3 30
## 5 4 40
## 7 5 50
# Note: attaching mice masks stats::filter and base::cbind / base::rbind
# (see the startup messages below).
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Multiple imputation with predictive mean matching, producing 5 imputed data
# sets. The iteration log below shows that only Ozone and Solar.R are imputed.
imp <- mice(data = airquality, m = 5, method = "pmm")
##
## iter imp variable
## 1 1 Ozone Solar.R
## 1 2 Ozone Solar.R
## 1 3 Ozone Solar.R
## 1 4 Ozone Solar.R
## 1 5 Ozone Solar.R
## 2 1 Ozone Solar.R
## 2 2 Ozone Solar.R
## 2 3 Ozone Solar.R
## 2 4 Ozone Solar.R
## 2 5 Ozone Solar.R
## 3 1 Ozone Solar.R
## 3 2 Ozone Solar.R
## 3 3 Ozone Solar.R
## 3 4 Ozone Solar.R
## 3 5 Ozone Solar.R
## 4 1 Ozone Solar.R
## 4 2 Ozone Solar.R
## 4 3 Ozone Solar.R
## 4 4 Ozone Solar.R
## 4 5 Ozone Solar.R
## 5 1 Ozone Solar.R
## 5 2 Ozone Solar.R
## 5 3 Ozone Solar.R
## 5 4 Ozone Solar.R
## 5 5 Ozone Solar.R
# Extract a single completed data set: action = 1L (the default) selects the
# first of the 5 imputations; the other four are not used here.
airquality_complete <- mice::complete(imp, action = 1L)
summary(airquality_complete)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.0 1st Qu.: 7.400 1st Qu.:72.00
## Median : 34.00 Median :203.0 Median : 9.700 Median :79.00
## Mean : 42.83 Mean :184.7 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.00 3rd Qu.:258.0 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## Month Day
## Min. :5.000 Min. : 1.0
## 1st Qu.:6.000 1st Qu.: 8.0
## Median :7.000 Median :16.0
## Mean :6.993 Mean :15.8
## 3rd Qu.:8.000 3rd Qu.:23.0
## Max. :9.000 Max. :31.0
# Install `car` only when it is missing, so that re-running the script does not
# re-download and reinstall the package on every execution.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
## Loading required package: carData

# Three independently generated predictors, plus X4, which is built as
# 2 * X1 + noise and is therefore strongly collinear with X1.
data_multicol <- data.frame(
  X1 = rnorm(100, 10, 3),
  X2 = rnorm(100, 20, 5),
  X3 = rnorm(100, 30, 7)
)
data_multicol$X4 <- data_multicol$X1 * 2 + rnorm(100, 0, 1)

# VIF measures collinearity among the PREDICTORS of a model, so the collinear
# variable X4 must be on the right-hand side. Using X4 as the response (as in
# X4 ~ X1 + X2 + X3) only inspects X1..X3, which are independent, and reports
# VIF values near 1 regardless of how X4 was constructed.
# The response itself does not influence the VIF values; any numeric response
# works for this diagnostic.
data_multicol$Y <- data_multicol$X1 + data_multicol$X2 + rnorm(100, 0, 1)
vif(lm(Y ~ X1 + X2 + X3 + X4, data = data_multicol))
# Expected pattern: VIF close to 1 for X2 and X3, and a large VIF for X1 and
# X4 (roughly 1 / (1 - R^2) with R^2 near 36/37, i.e. on the order of 30-40;
# exact values depend on the random draw).
# Выводы: Работа выполнена, пропущенные значения обработаны, выбросы удалены, дубликаты устранены, а мультиколлинеарность проанализирована.