# Remove every object from the current environment before the analysis starts.
rm(list = ls())
# Print the data set (its output is reproduced in the `##` lines below).
# NOTE(review): `mydata` is not created anywhere in the visible script, and the
# rm() call above would delete it if it had been created earlier in the
# session -- confirm where the data is loaded.
mydata
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.4 3.2 4.500 1.5 versicolor
## 2 6.3 3.3 6.000 2.5 virginica
## 3 6.2 NA 5.400 2.3 virginica
## 4 5.0 3.4 1.600 0.4 setosa
## 5 5.7 2.6 3.500 1.0 versicolor
## 6 5.3 NA NA 0.2 setosa
## 7 6.4 2.7 5.300 NA virginica
## 8 5.9 3.0 5.100 1.8 virginica
## 9 5.8 2.7 4.100 1.0 versicolor
## 10 4.8 3.1 1.600 0.2 setosa
## 11 5.0 3.5 1.600 0.6 setosa
## 12 6.0 2.7 5.100 1.6 versicolor
## 13 6.0 3.0 4.800 NA virginica
## 14 6.8 2.8 4.800 1.4 versicolor
## 15 NA 3.9 1.700 0.4 setosa
## 16 5.0 -3.0 3.500 1.0 versicolor
## 17 5.5 NA 4.000 1.3 versicolor
## 18 4.7 3.2 1.300 0.2 setosa
## 19 NA 4.0 NA 0.2 setosa
## 20 5.6 NA 4.200 1.3 versicolor
## 21 4.9 3.6 NA 0.1 setosa
## 22 5.4 NA 4.500 1.5 versicolor
## 23 6.2 2.8 NA 1.8 virginica
## 24 6.7 3.3 5.700 2.5 virginica
## 25 NA 3.0 5.900 2.1 virginica
## 26 4.6 3.2 1.400 0.2 setosa
## 27 4.9 3.1 1.500 0.1 setosa
## 28 73.0 29.0 63.000 NA virginica
## 29 6.5 3.2 5.100 2.0 virginica
## 30 NA 2.8 0.820 1.3 versicolor
## 31 4.4 3.2 NA 0.2 setosa
## 32 5.9 3.2 4.800 NA versicolor
## 33 5.7 2.8 4.500 1.3 versicolor
## 34 6.2 2.9 NA 1.3 versicolor
## 35 6.6 2.9 23.000 1.3 versicolor
## 36 4.8 3.0 1.400 0.1 setosa
## 37 6.5 3.0 5.500 1.8 virginica
## 38 6.2 2.2 4.500 1.5 versicolor
## 39 6.7 2.5 5.800 1.8 virginica
## 40 5.0 3.0 1.600 0.2 setosa
## 41 5.0 NA 1.200 0.2 setosa
## 42 5.8 2.7 3.900 1.2 versicolor
## 43 0.0 NA 1.300 0.4 setosa
## 44 5.8 2.7 5.100 1.9 virginica
## 45 5.5 4.2 1.400 0.2 setosa
## 46 7.7 2.8 6.700 2.0 virginica
## 47 5.7 NA NA 0.4 setosa
## 48 7.0 3.2 4.700 1.4 versicolor
## 49 6.5 3.0 5.800 2.2 virginica
## 50 6.0 3.4 4.500 1.6 versicolor
## 51 5.5 2.6 4.400 1.2 versicolor
## 52 4.9 3.1 NA 0.2 setosa
## 53 5.2 2.7 3.900 1.4 versicolor
## 54 4.8 3.4 1.600 0.2 setosa
## 55 6.3 3.3 4.700 1.6 versicolor
## 56 7.7 3.8 6.700 2.2 virginica
## 57 5.1 3.8 1.500 0.3 setosa
## 58 NA 2.9 4.500 1.5 versicolor
## 59 6.4 2.8 5.600 NA virginica
## 60 6.4 2.8 5.600 2.1 virginica
## 61 5.0 2.3 3.300 NA versicolor
## 62 7.4 2.8 6.100 1.9 virginica
## 63 4.3 3.0 1.100 0.1 setosa
## 64 5.0 3.3 1.400 0.2 setosa
## 65 7.2 3.0 5.800 1.6 virginica
## 66 6.3 2.5 4.900 1.5 versicolor
## 67 5.1 2.5 NA 1.1 versicolor
## 68 NA 3.2 5.700 2.3 virginica
## 69 5.1 3.5 NA NA setosa
## 70 5.0 3.5 1.300 0.3 setosa
## 71 6.1 3.0 4.600 1.4 versicolor
## 72 6.9 3.1 5.100 2.3 virginica
## 73 5.1 3.5 1.400 0.3 setosa
## 74 6.5 NA 4.600 1.5 versicolor
## 75 5.6 2.8 4.900 2.0 virginica
## 76 4.9 2.5 4.500 NA virginica
## 77 5.5 3.5 1.300 0.2 setosa
## 78 7.6 3.0 6.600 2.1 virginica
## 79 5.1 3.8 0.000 0.2 setosa
## 80 7.9 3.8 6.400 2.0 virginica
## 81 6.1 2.6 5.600 1.4 virginica
## 82 5.4 3.4 1.700 0.2 setosa
## 83 6.1 2.9 4.700 1.4 versicolor
## 84 5.4 3.7 1.500 0.2 setosa
## 85 6.7 3.0 5.200 2.3 virginica
## 86 5.1 3.8 1.900 Inf setosa
## 87 6.4 2.9 4.300 1.3 versicolor
## 88 5.7 2.9 4.200 1.3 versicolor
## 89 4.4 2.9 1.400 0.2 setosa
## 90 6.3 2.5 5.000 1.9 virginica
## 91 7.2 3.2 6.000 1.8 virginica
## 92 4.9 NA 3.300 1.0 versicolor
## 93 5.2 3.4 1.400 0.2 setosa
## 94 5.8 2.7 5.100 1.9 virginica
## 95 6.0 2.2 5.000 1.5 virginica
## 96 6.9 3.1 NA 1.5 versicolor
## 97 5.5 2.3 4.000 1.3 versicolor
## 98 6.7 NA 5.000 1.7 versicolor
## 99 5.7 3.0 4.200 1.2 versicolor
## 100 6.3 2.8 5.100 1.5 virginica
## 101 5.4 3.4 1.500 0.4 setosa
## 102 7.2 3.6 NA 2.5 virginica
## 103 6.3 2.7 4.900 NA virginica
## 104 5.6 3.0 4.100 1.3 versicolor
## 105 5.1 3.7 NA 0.4 setosa
## 106 5.5 NA 0.925 1.0 versicolor
## 107 6.5 3.0 5.200 2.0 virginica
## 108 4.8 3.0 1.400 NA setosa
## 109 6.1 2.8 NA 1.3 versicolor
## 110 4.6 3.4 1.400 0.3 setosa
## 111 6.3 3.4 NA 2.4 virginica
## 112 5.0 3.4 1.500 0.2 setosa
## 113 5.1 3.4 1.500 0.2 setosa
## 114 NA 3.3 5.700 2.1 virginica
## 115 6.7 3.1 4.700 1.5 versicolor
## 116 7.7 2.6 6.900 2.3 virginica
## 117 6.3 NA 4.400 1.3 versicolor
## 118 4.6 3.1 1.500 0.2 setosa
## 119 NA 3.0 5.500 2.1 virginica
## 120 NA 2.8 4.700 1.2 versicolor
## 121 5.9 3.0 NA 1.5 versicolor
## 122 4.5 2.3 1.300 0.3 setosa
## 123 6.4 3.2 5.300 2.3 virginica
## 124 5.2 4.1 1.500 0.1 setosa
## 125 49.0 30.0 14.000 2.0 setosa
## 126 5.6 2.9 3.600 1.3 versicolor
## 127 6.8 3.2 5.900 2.3 virginica
## 128 5.8 NA 5.100 2.4 virginica
## 129 4.6 3.6 NA 0.2 setosa
## 130 5.7 0.0 1.700 0.3 setosa
## 131 5.6 2.5 3.900 1.1 versicolor
## 132 6.7 3.1 4.400 1.4 versicolor
## 133 4.8 NA 1.900 0.2 setosa
## 134 5.1 3.3 1.700 0.5 setosa
## 135 4.4 3.0 1.300 NA setosa
## 136 7.7 3.0 NA 2.3 virginica
## 137 4.7 3.2 1.600 0.2 setosa
## 138 NA 3.0 4.900 1.8 virginica
## 139 6.9 3.1 5.400 2.1 virginica
## 140 6.0 2.2 4.000 1.0 versicolor
## 141 5.0 NA 1.400 0.2 setosa
## 142 5.5 NA 3.800 1.1 versicolor
## 143 6.6 3.0 4.400 1.4 versicolor
## 144 6.3 2.9 5.600 1.8 virginica
## 145 5.7 2.5 5.000 2.0 virginica
## 146 6.7 3.1 5.600 2.4 virginica
## 147 5.6 3.0 4.500 1.5 versicolor
## 148 5.2 3.5 1.500 0.2 setosa
## 149 6.4 3.1 NA 1.8 virginica
## 150 5.8 2.6 4.000 NA versicolor
# Number of rows of mydata that contain no missing value at all.
length(which(complete.cases(mydata)))
# The same count expressed as a percentage of all rows of mydata.
sum(complete.cases(mydata)) / nrow(mydata) * 100
# Flag the "special" entries of a vector.
#   x: a vector (typically one column of a data frame).
# For numeric input every non-finite element (NA, NaN, Inf, -Inf) is flagged;
# for any other type only missing values are flagged.
# Returns a logical vector with one entry per element of x.
is.special <- function(x) {
  if (!is.numeric(x)) {
    return(is.na(x))
  }
  !is.finite(x)
}
# Show, column by column, which entries of mydata is.special() flags.
sapply(X = mydata, FUN = is.special)
# Overwrite every flagged entry with NA, one column at a time, and then print
# a summary of the resulting data.
for (n in names(mydata)) {
  mydata[[n]][is.special(mydata[[n]])] <- NA
}
summary(mydata)
# Visualise the missing values of the airquality data.
airquality %>% vis_miss()
# Show the patterns (combinations) of missing values in airquality.
airquality %>% gg_miss_upset()
# Show the patterns of missing values in the riskfactors data.
riskfactors %>% gg_miss_upset()
# Scatter plot of Ozone against Solar.R in airquality, drawn with
# geom_miss_point() so that the missing values of both variables are
# visualised as well.
ggplot(data = airquality, mapping = aes(x = Ozone, y = Solar.R)) +
  geom_miss_point()
# Missing values per variable in airquality.
airquality %>% gg_miss_var()
# The same plot with a custom y-axis label.
(airquality %>% gg_miss_var()) +
  labs(y = "Look at all the missing ones")
# Missing values per case (row) in airquality, with a custom x-axis label.
(airquality %>% gg_miss_case()) +
  labs(x = "Number of Cases")
# Missing values in riskfactors, using `marital` as the factor variable.
riskfactors %>% gg_miss_fct(fct = marital)
# Tabular summary of the missing values per variable in airquality.
airquality %>% miss_var_summary()
# Summary of missing values per variable in airquality, computed separately
# for each value of Month.
miss_var_summary(group_by(airquality, Month))
# Copy ToothGrowth into a data frame and treat dose as a factor.
tooth <- as.data.frame(ToothGrowth)
tooth[["dose"]] <- factor(tooth[["dose"]])
# Boxplot of len for each dose level (coloured by dose) to spot outliers.
p <- ggplot(data = tooth, mapping = aes(x = dose, y = len, color = dose)) +
  geom_boxplot()
p
# List the outliers of len within each dose group.
identify_outliers(group_by(tooth, dose), len)
# Consistency rules for mydata:
#   - Sepal.Length may not exceed 30 and Species must be one of three names,
#   - all four measurements must be strictly positive,
#   - Petal.Length must be at least twice Petal.Width,
#   - Sepal.Length must be larger than Petal.Length.
RULE <- editset(
  c(
    "Sepal.Length <= 30",
    "Species %in% c('setosa','versicolor','virginica')",
    "Sepal.Length > 0",
    "Sepal.Width > 0",
    "Petal.Length > 0",
    "Petal.Width > 0",
    "Petal.Length >= 2 * Petal.Width",
    "Sepal.Length>Petal.Length"
  )
)
RULE
# Summarise the violations of these rules found in mydata.
summary(
  violatedEdits(RULE, mydata)
)
# Store the violation result, summarise it once more and plot it.
violated <- violatedEdits(RULE, mydata)
summary(violated)
plot(violated)
# Correction rule: when Sepal.Width is present (not NA) but less than or equal
# to 0, replace it with NA.
cr <- correctionRules(expression(
if (!is.na(Sepal.Width) && Sepal.Width <=0 ) Sepal.Width = NA
))
# Apply the rule to mydata and print the result.
# NOTE(review): the return value is not assigned, so mydata itself is not
# changed by this call -- confirm that only the printed output is wanted.
correctWithRules(cr, mydata)
# Set to NA every cell of mydata that localizeErrors() marks in its `adapt`
# component, then check whether any rule of RULE is still violated
# (missing results are ignored).
is.na(mydata) <- localizeErrors(RULE, mydata)[["adapt"]]
any(violatedEdits(RULE, mydata), na.rm = TRUE)
# Display the structure of the Carseats data.
str(Carseats)
# Work on a copy taken from the ISLR package and blank out randomly chosen
# values: 20 in Income (seed 123) and 10 in Urban (seed 456).
# RNGversion("3.5.0") is called before each seed, with its warnings silenced.
carseats <- ISLR::Carseats
suppressWarnings(RNGversion("3.5.0"))
set.seed(123)
carseats$Income[sample(seq_len(nrow(carseats)), size = 20)] <- NA
suppressWarnings(RNGversion("3.5.0"))
set.seed(456)
carseats$Urban[sample(seq_len(nrow(carseats)), size = 10)] <- NA
# Impute the missing Income values of carseats with the "rpart" method,
# passing US as the third argument.
income <- carseats %>%
  imputate_na(Income, US, method = "rpart")
# Summary of the imputation.
summary(income)
# Visualisation of the imputation.
plot(income)
# Impute the missing Urban values of carseats with the "mice" method
# (the mice package is attached first), passing US as the third argument.
library(mice)
urban <- carseats %>%
  imputate_na(Urban, US, method = "mice")
# Print the imputed Urban variable.
urban
# Summary of the imputation.
summary(urban)
# Visualisation of the imputation.
plot(urban)
# Rescale Income and Sales with the "minmax" method of transform() and draw
# boxplots of the two rescaled variables.
boxplot(
  carseats %>%
    mutate(
      Income_minmax = transform(Income, method = "minmax"),
      Sales_minmax = transform(Sales, method = "minmax")
    ) %>%
    select(Income_minmax, Sales_minmax)
)
# Bin the Income variable of carseats; no `type` is given, so the default
# binning type of binning() is used.
bin <- binning(carseats[["Income"]])
# Print the resulting bins object.
bin
# Summarise the bins created for Income.
summary(bin)
# Plot the bins created for Income.
plot(bin)
# Frequency table of ShelveLoc by binned Income: the ten most frequent
# combinations, in decreasing order of frequency.
carseats %>%
  mutate(Income_bin = extract(binning(Income))) %>%
  group_by(ShelveLoc, Income_bin) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq)) %>%
  head(n = 10)
# Bin Income into four bins that carry the given labels.
income_fixed <- binning(
  carseats[["Income"]],
  nbins = 4,
  labels = c("low", "average", "high", "very high")
)
# Summarise the labelled bins.
summary(income_fixed)
# Plot the labelled bins.
plot(income_fixed)
# Bin the Advertising variable of carseats with binning_by(), with "US"
# supplied as the second argument (presumably the target variable -- confirm
# against the binning_by() documentation). This overwrites the earlier `bin`.
bin <- carseats %>%
  binning_by("US", "Advertising")
# Summarise the bins created for Advertising.
summary(bin)
# Plot the bins created for Advertising.
plot(bin)
# Exercise 1. Find out which observations have too long sepals using the result of violatedEdits.
# solution for the exercise 1 here ;-)
# if (length(violated) > 0) {
#   print(mydata[violated, ])
# }
# the code is made as comment as Rstudio does not see "violated" during knitting
# Impute the missing Income values of carseats with the "mice" method, passing
# Education as the third argument; the mean before and after the imputation is
# shown by summary(MICE).
MICE <- carseats %>%
  imputate_na(Income, Education, method = "mice")
##
## iter imp variable
## 1 1 Income Urban
## 1 2 Income Urban
## 1 3 Income Urban
## 1 4 Income Urban
## 1 5 Income Urban
## 2 1 Income Urban
## 2 2 Income Urban
## 2 3 Income Urban
## 2 4 Income Urban
## 2 5 Income Urban
## 3 1 Income Urban
## 3 2 Income Urban
## 3 3 Income Urban
## 3 4 Income Urban
## 3 5 Income Urban
## 4 1 Income Urban
## 4 2 Income Urban
## 4 3 Income Urban
## 4 4 Income Urban
## 4 5 Income Urban
## 5 1 Income Urban
## 5 2 Income Urban
## 5 3 Income Urban
## 5 4 Income Urban
## 5 5 Income Urban
# Compare Income before and after the mice imputation (the printed table is
# reproduced in the `##` lines below).
summary(MICE)
## * Impute missing values based on Multivariate Imputation by Chained Equations
## - method : mice
## - random seed : 37295
##
## * Information of Imputation (before vs after)
## Original Imputation
## described_variables "value" "value"
## n "380" "400"
## na "20" " 0"
## mean "68.86053" "68.98650"
## sd "28.09161" "27.52111"
## se_mean "1.441069" "1.376055"
## IQR "48.25" "45.25"
## skewness "0.04490600" "0.03761104"
## kurtosis "-1.089201" "-1.022505"
## p00 "21" "21"
## p01 "21.79" "21.99"
## p05 "26" "26"
## p10 "30.0" "30.9"
## p20 "39" "40"
## p25 "42.75" "44.75"
## p30 "48" "52"
## p40 "62.00" "62.36"
## p50 "69" "69"
## p60 "78.00" "77.32"
## p70 "86.3" "84.0"
## p75 "91" "90"
## p80 "96.2" "95.2"
## p90 "108.1" "106.1"
## p95 "115.05" "115.00"
## p99 "119.21" "119.01"
## p100 "120" "120"
# Convert the imputation result to a data frame (this overwrites the MICE
# object) and print it; the output is reproduced in the `##` lines below.
MICE <- as.data.frame(MICE)
MICE
## MICE
## 1 73.0
## 2 48.0
## 3 35.0
## 4 100.0
## 5 64.0
## 6 113.0
## 7 105.0
## 8 81.0
## 9 110.0
## 10 113.0
## 11 78.0
## 12 94.0
## 13 35.0
## 14 28.0
## 15 117.0
## 16 95.0
## 17 66.2
## 18 62.6
## 19 110.0
## 20 76.0
## 21 90.0
## 22 29.0
## 23 46.0
## 24 31.0
## 25 119.0
## 26 32.0
## 27 115.0
## 28 118.0
## 29 74.0
## 30 99.0
## 31 94.0
## 32 58.0
## 33 32.0
## 34 38.0
## 35 54.0
## 36 84.0
## 37 76.0
## 38 41.0
## 39 73.0
## 40 70.8
## 41 98.0
## 42 53.0
## 43 69.0
## 44 42.0
## 45 79.0
## 46 63.0
## 47 90.0
## 48 98.0
## 49 52.0
## 50 93.0
## 51 32.0
## 52 90.0
## 53 40.0
## 54 64.0
## 55 103.0
## 56 81.0
## 57 82.0
## 58 91.0
## 59 93.0
## 60 71.0
## 61 102.0
## 62 32.0
## 63 45.0
## 64 88.0
## 65 67.0
## 66 26.0
## 67 92.0
## 68 61.0
## 69 69.0
## 70 59.0
## 71 81.0
## 72 51.0
## 73 45.0
## 74 90.0
## 75 68.0
## 76 111.0
## 77 87.0
## 78 71.0
## 79 48.0
## 80 67.0
## 81 100.0
## 82 72.0
## 83 83.0
## 84 36.0
## 85 25.0
## 86 103.0
## 87 84.0
## 88 67.0
## 89 42.0
## 90 66.0
## 91 22.0
## 92 46.0
## 93 113.0
## 94 30.0
## 95 53.8
## 96 25.0
## 97 42.0
## 98 82.0
## 99 77.0
## 100 47.0
## 101 69.0
## 102 93.0
## 103 22.0
## 104 91.0
## 105 96.0
## 106 100.0
## 107 33.0
## 108 107.0
## 109 79.0
## 110 65.0
## 111 62.0
## 112 118.0
## 113 99.0
## 114 29.0
## 115 87.0
## 116 60.8
## 117 75.0
## 118 53.0
## 119 88.0
## 120 94.0
## 121 105.0
## 122 89.0
## 123 100.0
## 124 103.0
## 125 113.0
## 126 88.2
## 127 68.0
## 128 48.0
## 129 100.0
## 130 120.0
## 131 84.0
## 132 69.0
## 133 87.0
## 134 98.0
## 135 31.0
## 136 94.0
## 137 75.0
## 138 42.0
## 139 103.0
## 140 62.0
## 141 60.0
## 142 42.0
## 143 84.0
## 144 88.0
## 145 68.0
## 146 63.0
## 147 83.0
## 148 54.0
## 149 119.0
## 150 120.0
## 151 84.0
## 152 58.0
## 153 78.0
## 154 36.0
## 155 69.0
## 156 72.0
## 157 34.0
## 158 58.0
## 159 90.0
## 160 60.0
## 161 28.0
## 162 21.0
## 163 80.4
## 164 64.0
## 165 64.0
## 166 58.0
## 167 67.0
## 168 73.0
## 169 89.0
## 170 41.0
## 171 39.0
## 172 106.0
## 173 102.0
## 174 91.0
## 175 24.0
## 176 89.0
## 177 77.8
## 178 72.0
## 179 95.0
## 180 25.0
## 181 112.0
## 182 83.0
## 183 60.0
## 184 74.0
## 185 33.0
## 186 100.0
## 187 51.0
## 188 32.0
## 189 37.0
## 190 117.0
## 191 37.0
## 192 42.0
## 193 26.0
## 194 70.0
## 195 98.0
## 196 93.0
## 197 28.0
## 198 61.0
## 199 80.0
## 200 88.0
## 201 92.0
## 202 83.0
## 203 78.0
## 204 82.0
## 205 80.0
## 206 22.0
## 207 67.0
## 208 105.0
## 209 100.2
## 210 21.0
## 211 41.0
## 212 118.0
## 213 69.0
## 214 84.0
## 215 115.0
## 216 83.0
## 217 71.2
## 218 44.0
## 219 61.0
## 220 79.0
## 221 120.0
## 222 72.2
## 223 119.0
## 224 45.0
## 225 82.0
## 226 25.0
## 227 33.0
## 228 64.0
## 229 73.0
## 230 104.0
## 231 60.0
## 232 69.0
## 233 80.0
## 234 76.0
## 235 62.0
## 236 32.0
## 237 34.0
## 238 28.0
## 239 24.0
## 240 105.0
## 241 80.0
## 242 63.0
## 243 46.0
## 244 25.0
## 245 30.0
## 246 43.0
## 247 56.0
## 248 114.0
## 249 52.0
## 250 67.0
## 251 105.0
## 252 111.0
## 253 97.0
## 254 24.0
## 255 104.0
## 256 81.0
## 257 40.0
## 258 62.0
## 259 38.0
## 260 36.0
## 261 117.0
## 262 42.0
## 263 60.4
## 264 26.0
## 265 29.0
## 266 35.0
## 267 93.0
## 268 82.0
## 269 57.0
## 270 69.0
## 271 26.0
## 272 56.0
## 273 33.0
## 274 106.0
## 275 93.0
## 276 119.0
## 277 69.0
## 278 48.0
## 279 113.0
## 280 57.0
## 281 86.0
## 282 69.0
## 283 96.0
## 284 110.0
## 285 46.0
## 286 26.0
## 287 118.0
## 288 44.0
## 289 40.0
## 290 77.0
## 291 111.0
## 292 70.0
## 293 66.0
## 294 84.0
## 295 76.0
## 296 35.0
## 297 44.0
## 298 83.0
## 299 63.0
## 300 40.0
## 301 78.0
## 302 93.0
## 303 77.0
## 304 52.0
## 305 98.0
## 306 29.0
## 307 32.0
## 308 92.0
## 309 80.0
## 310 111.0
## 311 65.0
## 312 68.0
## 313 117.0
## 314 81.0
## 315 79.4
## 316 21.0
## 317 36.0
## 318 30.0
## 319 72.0
## 320 45.0
## 321 70.0
## 322 39.0
## 323 50.0
## 324 105.0
## 325 65.0
## 326 69.0
## 327 30.0
## 328 38.0
## 329 66.0
## 330 54.0
## 331 59.0
## 332 63.0
## 333 33.0
## 334 60.0
## 335 117.0
## 336 70.0
## 337 35.0
## 338 38.0
## 339 24.0
## 340 44.0
## 341 29.0
## 342 120.0
## 343 102.0
## 344 42.0
## 345 80.0
## 346 68.0
## 347 65.0
## 348 39.0
## 349 102.0
## 350 27.0
## 351 68.8
## 352 115.0
## 353 103.0
## 354 67.0
## 355 31.0
## 356 100.0
## 357 109.0
## 358 73.0
## 359 96.0
## 360 62.0
## 361 86.0
## 362 25.0
## 363 55.0
## 364 69.8
## 365 21.0
## 366 30.0
## 367 56.0
## 368 106.0
## 369 22.0
## 370 100.0
## 371 41.0
## 372 81.0
## 373 60.2
## 374 71.4
## 375 47.0
## 376 46.0
## 377 60.0
## 378 61.0
## 379 88.0
## 380 111.0
## 381 64.0
## 382 65.0
## 383 28.0
## 384 117.0
## 385 37.0
## 386 73.0
## 387 116.0
## 388 73.0
## 389 89.0
## 390 42.0
## 391 75.0
## 392 63.0
## 393 42.0
## 394 51.0
## 395 58.0
## 396 108.0
## 397 53.4
## 398 26.0
## 399 79.0
## 400 37.0
#Mean is shown in the R console before and after imputation of the income variable
# solution for the exercise 4 here ;-)
# Impute the outliers of Price in carseats with the "capping" method and
# summarise the variable before and after the imputation.
Outers <- carseats %>%
  imputate_outlier(Price, method = "capping")
summary(Outers)
## Impute outliers with capping
##
## * Information of Imputation (before vs after)
## Original Imputation
## described_variables "value" "value"
## n "400" "400"
## na "0" "0"
## mean "115.7950" "115.8928"
## sd "23.67666" "22.61092"
## se_mean "1.183833" "1.130546"
## IQR "31" "31"
## skewness "-0.1252862" "-0.0461621"
## kurtosis " 0.4518850" "-0.3030578"
## p00 "24" "54"
## p01 "54.99" "67.96"
## p05 "77" "77"
## p10 "87" "87"
## p20 "96.8" "96.8"
## p25 "100" "100"
## p30 "104" "104"
## p40 "110" "110"
## p50 "117" "117"
## p60 "122" "122"
## p70 "128.3" "128.3"
## p75 "131" "131"
## p80 "134" "134"
## p90 "146" "146"
## p95 "155.0500" "155.0025"
## p99 "166.05" "164.02"
## p100 "191" "173"
# Plot the result of the outlier imputation of Price.
plot(Outers)
###
# Exercise 5.
# solution for the exercise 5 here ;-)
# Bin Income into four bins of type "quantile" and summarise the bins.
binn <- binning(
  carseats[["Income"]],
  nbins = 4,
  type = "quantile"
)
summary(binn)
## levels freq rate
## 1 [21,42.41667] 95 0.2375
## 2 (42.41667,69] 102 0.2550
## 3 (69,91] 89 0.2225
## 4 (91,120] 94 0.2350
## 5 <NA> 20 0.0500
# Plot the four quantile bins created for Income.
plot(binn)