Raport 4

# Remove every object from the global environment before the analysis starts.
# NOTE(review): rm(list = ls()) does not detach packages or reset options, and
# it runs before `mydata` is printed, so `mydata` must be created by code that
# is not visible in this report (e.g. a hidden chunk) - confirm.
rm(list = ls())

# Print the raw data set; the listing below is its console output.
mydata

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            6.4         3.2        4.500         1.5 versicolor
## 2            6.3         3.3        6.000         2.5  virginica
## 3            6.2          NA        5.400         2.3  virginica
## 4            5.0         3.4        1.600         0.4     setosa
## 5            5.7         2.6        3.500         1.0 versicolor
## 6            5.3          NA           NA         0.2     setosa
## 7            6.4         2.7        5.300          NA  virginica
## 8            5.9         3.0        5.100         1.8  virginica
## 9            5.8         2.7        4.100         1.0 versicolor
## 10           4.8         3.1        1.600         0.2     setosa
## 11           5.0         3.5        1.600         0.6     setosa
## 12           6.0         2.7        5.100         1.6 versicolor
## 13           6.0         3.0        4.800          NA  virginica
## 14           6.8         2.8        4.800         1.4 versicolor
## 15            NA         3.9        1.700         0.4     setosa
## 16           5.0        -3.0        3.500         1.0 versicolor
## 17           5.5          NA        4.000         1.3 versicolor
## 18           4.7         3.2        1.300         0.2     setosa
## 19            NA         4.0           NA         0.2     setosa
## 20           5.6          NA        4.200         1.3 versicolor
## 21           4.9         3.6           NA         0.1     setosa
## 22           5.4          NA        4.500         1.5 versicolor
## 23           6.2         2.8           NA         1.8  virginica
## 24           6.7         3.3        5.700         2.5  virginica
## 25            NA         3.0        5.900         2.1  virginica
## 26           4.6         3.2        1.400         0.2     setosa
## 27           4.9         3.1        1.500         0.1     setosa
## 28          73.0        29.0       63.000          NA  virginica
## 29           6.5         3.2        5.100         2.0  virginica
## 30            NA         2.8        0.820         1.3 versicolor
## 31           4.4         3.2           NA         0.2     setosa
## 32           5.9         3.2        4.800          NA versicolor
## 33           5.7         2.8        4.500         1.3 versicolor
## 34           6.2         2.9           NA         1.3 versicolor
## 35           6.6         2.9       23.000         1.3 versicolor
## 36           4.8         3.0        1.400         0.1     setosa
## 37           6.5         3.0        5.500         1.8  virginica
## 38           6.2         2.2        4.500         1.5 versicolor
## 39           6.7         2.5        5.800         1.8  virginica
## 40           5.0         3.0        1.600         0.2     setosa
## 41           5.0          NA        1.200         0.2     setosa
## 42           5.8         2.7        3.900         1.2 versicolor
## 43           0.0          NA        1.300         0.4     setosa
## 44           5.8         2.7        5.100         1.9  virginica
## 45           5.5         4.2        1.400         0.2     setosa
## 46           7.7         2.8        6.700         2.0  virginica
## 47           5.7          NA           NA         0.4     setosa
## 48           7.0         3.2        4.700         1.4 versicolor
## 49           6.5         3.0        5.800         2.2  virginica
## 50           6.0         3.4        4.500         1.6 versicolor
## 51           5.5         2.6        4.400         1.2 versicolor
## 52           4.9         3.1           NA         0.2     setosa
## 53           5.2         2.7        3.900         1.4 versicolor
## 54           4.8         3.4        1.600         0.2     setosa
## 55           6.3         3.3        4.700         1.6 versicolor
## 56           7.7         3.8        6.700         2.2  virginica
## 57           5.1         3.8        1.500         0.3     setosa
## 58            NA         2.9        4.500         1.5 versicolor
## 59           6.4         2.8        5.600          NA  virginica
## 60           6.4         2.8        5.600         2.1  virginica
## 61           5.0         2.3        3.300          NA versicolor
## 62           7.4         2.8        6.100         1.9  virginica
## 63           4.3         3.0        1.100         0.1     setosa
## 64           5.0         3.3        1.400         0.2     setosa
## 65           7.2         3.0        5.800         1.6  virginica
## 66           6.3         2.5        4.900         1.5 versicolor
## 67           5.1         2.5           NA         1.1 versicolor
## 68            NA         3.2        5.700         2.3  virginica
## 69           5.1         3.5           NA          NA     setosa
## 70           5.0         3.5        1.300         0.3     setosa
## 71           6.1         3.0        4.600         1.4 versicolor
## 72           6.9         3.1        5.100         2.3  virginica
## 73           5.1         3.5        1.400         0.3     setosa
## 74           6.5          NA        4.600         1.5 versicolor
## 75           5.6         2.8        4.900         2.0  virginica
## 76           4.9         2.5        4.500          NA  virginica
## 77           5.5         3.5        1.300         0.2     setosa
## 78           7.6         3.0        6.600         2.1  virginica
## 79           5.1         3.8        0.000         0.2     setosa
## 80           7.9         3.8        6.400         2.0  virginica
## 81           6.1         2.6        5.600         1.4  virginica
## 82           5.4         3.4        1.700         0.2     setosa
## 83           6.1         2.9        4.700         1.4 versicolor
## 84           5.4         3.7        1.500         0.2     setosa
## 85           6.7         3.0        5.200         2.3  virginica
## 86           5.1         3.8        1.900         Inf     setosa
## 87           6.4         2.9        4.300         1.3 versicolor
## 88           5.7         2.9        4.200         1.3 versicolor
## 89           4.4         2.9        1.400         0.2     setosa
## 90           6.3         2.5        5.000         1.9  virginica
## 91           7.2         3.2        6.000         1.8  virginica
## 92           4.9          NA        3.300         1.0 versicolor
## 93           5.2         3.4        1.400         0.2     setosa
## 94           5.8         2.7        5.100         1.9  virginica
## 95           6.0         2.2        5.000         1.5  virginica
## 96           6.9         3.1           NA         1.5 versicolor
## 97           5.5         2.3        4.000         1.3 versicolor
## 98           6.7          NA        5.000         1.7 versicolor
## 99           5.7         3.0        4.200         1.2 versicolor
## 100          6.3         2.8        5.100         1.5  virginica
## 101          5.4         3.4        1.500         0.4     setosa
## 102          7.2         3.6           NA         2.5  virginica
## 103          6.3         2.7        4.900          NA  virginica
## 104          5.6         3.0        4.100         1.3 versicolor
## 105          5.1         3.7           NA         0.4     setosa
## 106          5.5          NA        0.925         1.0 versicolor
## 107          6.5         3.0        5.200         2.0  virginica
## 108          4.8         3.0        1.400          NA     setosa
## 109          6.1         2.8           NA         1.3 versicolor
## 110          4.6         3.4        1.400         0.3     setosa
## 111          6.3         3.4           NA         2.4  virginica
## 112          5.0         3.4        1.500         0.2     setosa
## 113          5.1         3.4        1.500         0.2     setosa
## 114           NA         3.3        5.700         2.1  virginica
## 115          6.7         3.1        4.700         1.5 versicolor
## 116          7.7         2.6        6.900         2.3  virginica
## 117          6.3          NA        4.400         1.3 versicolor
## 118          4.6         3.1        1.500         0.2     setosa
## 119           NA         3.0        5.500         2.1  virginica
## 120           NA         2.8        4.700         1.2 versicolor
## 121          5.9         3.0           NA         1.5 versicolor
## 122          4.5         2.3        1.300         0.3     setosa
## 123          6.4         3.2        5.300         2.3  virginica
## 124          5.2         4.1        1.500         0.1     setosa
## 125         49.0        30.0       14.000         2.0     setosa
## 126          5.6         2.9        3.600         1.3 versicolor
## 127          6.8         3.2        5.900         2.3  virginica
## 128          5.8          NA        5.100         2.4  virginica
## 129          4.6         3.6           NA         0.2     setosa
## 130          5.7         0.0        1.700         0.3     setosa
## 131          5.6         2.5        3.900         1.1 versicolor
## 132          6.7         3.1        4.400         1.4 versicolor
## 133          4.8          NA        1.900         0.2     setosa
## 134          5.1         3.3        1.700         0.5     setosa
## 135          4.4         3.0        1.300          NA     setosa
## 136          7.7         3.0           NA         2.3  virginica
## 137          4.7         3.2        1.600         0.2     setosa
## 138           NA         3.0        4.900         1.8  virginica
## 139          6.9         3.1        5.400         2.1  virginica
## 140          6.0         2.2        4.000         1.0 versicolor
## 141          5.0          NA        1.400         0.2     setosa
## 142          5.5          NA        3.800         1.1 versicolor
## 143          6.6         3.0        4.400         1.4 versicolor
## 144          6.3         2.9        5.600         1.8  virginica
## 145          5.7         2.5        5.000         2.0  virginica
## 146          6.7         3.1        5.600         2.4  virginica
## 147          5.6         3.0        4.500         1.5 versicolor
## 148          5.2         3.5        1.500         0.2     setosa
## 149          6.4         3.1           NA         1.8  virginica
## 150          5.8         2.6        4.000          NA versicolor

Detecting NA

# Number of rows of mydata that have no missing value in any column.
sum(complete.cases(mydata))

# The same count expressed as a percentage of all rows of mydata.
sum(complete.cases(mydata)) / nrow(mydata) * 100

# Flag the "special" elements of a vector.
#
# x: an atomic vector (or factor).
# Returns a logical vector of the same length as x. For numeric input an
# element is special when it is not finite (NA, NaN, Inf, -Inf); for any other
# input an element is special when it is NA.
is.special <- function(x) {
  if (!is.numeric(x)) {
    return(is.na(x))
  }
  !is.finite(x)
}

# Logical matrix (one row per record, one column per variable) that flags every
# special value in mydata. vapply() is used instead of sapply() so the shape of
# the result is fixed by FUN.VALUE rather than guessed from the input (sapply()
# falls back to a list when the data frame has zero rows).
vapply(mydata, is.special, FUN.VALUE = logical(nrow(mydata)))

# Replace every special value (NA, or non-finite for numeric columns) with NA,
# one column at a time, so that all of them are treated as missing from here on.
for (n in colnames(mydata)) {
  is.na(mydata[[n]]) <- is.special(mydata[[n]])
}

# Summarise the cleaned data set.
summary(mydata)

Exploring NA’s

# Visualise where the missing values are in the airquality dataset.
vis_miss(airquality)

Patterns

# Display the patterns of missing values in the airquality dataset.
gg_miss_upset(airquality)

# Display the patterns of missing values in the riskfactors dataset.
gg_miss_upset(riskfactors)

NA’s Mechanisms

# Scatter plot of Ozone against Solar.R in the airquality dataset, drawn with
# geom_miss_point() rather than the regular geom_point() so that the missing
# values of both variables are visualised as well.
ggplot(airquality,
       aes(x = Ozone,
           y = Solar.R)) +
geom_miss_point()

NA’s in vars

# Plot the amount of missing values for each variable of the airquality dataset.
gg_miss_var(airquality)

# Same plot with the y-axis label overridden by a custom text.
gg_miss_var(airquality) + labs(y = "Look at all the missing ones")

NA’s in cases

# Plot the number of missing values for each case (row) of the airquality
# dataset, with a custom x-axis label.
gg_miss_case(airquality) + labs(x = "Number of Cases")

NA’s across factors

# Plot the missing values of the riskfactors dataset broken down by the levels
# of the factor variable 'marital' (passed as fct).
gg_miss_fct(x = riskfactors, fct = marital)

Summaries for NA’s

# Per-variable summary of missing values in the airquality dataset.
miss_var_summary(airquality)

# The same per-variable summary, computed separately for each value of Month.
airquality %>%
  group_by(Month) %>%
  miss_var_summary()

Identify outliers

# Work on a copy of ToothGrowth in which dose is a factor, so that every dose
# level gets its own box and colour.
tooth <- as.data.frame(ToothGrowth)
tooth[["dose"]] <- as.factor(tooth[["dose"]])

# Boxplot of len for each dose level; points drawn beyond the whiskers are the
# candidate outliers.
p <- ggplot(data = tooth, mapping = aes(x = dose, y = len, color = dose)) +
  geom_boxplot()
p

# List the outlying values of len within each dose group.
identify_outliers(group_by(tooth, dose), len)

Checking consistency

# Consistency rules for the iris-like data in mydata: an upper bound on sepal
# length, the allowed species labels, strictly positive measurements, and two
# relations between the measurements.
RULE <- editset(c(
  "Sepal.Length <= 30",
  "Species %in% c('setosa','versicolor','virginica')",
  "Sepal.Length > 0",
  "Sepal.Width > 0",
  "Petal.Length > 0",
  "Petal.Width > 0",
  "Petal.Length >= 2 * Petal.Width",
  "Sepal.Length>Petal.Length"
))
RULE

# Check every record of mydata against every rule once, keep the result for
# later use, and summarise how often each rule is violated. (The check used to
# be computed and summarised twice in a row.)
violated <- violatedEdits(RULE, mydata)
summary(violated)

# Plot the rule violations found in mydata; no corrections have been applied at
# this point.
plot(violated)

Corrections

# Correction rule: a Sepal.Width that is present but not strictly positive is
# replaced by NA.
cr <- correctionRules(expression(
  if (!is.na(Sepal.Width) && Sepal.Width <=0 ) Sepal.Width = NA
  ))

# correctWithRules() returns its result instead of modifying mydata in place.
# Previously the result was only printed, so the correction was never applied.
# Keep the result, print it as before, and write the corrected data back.
corrected <- correctWithRules(cr, mydata)
corrected
mydata <- corrected$corrected

# Set to NA every value that localizeErrors() marks as needing adaptation for
# the record to satisfy RULE, then check whether any violation remains
# (violations that cannot be evaluated because of NA are ignored).
mydata[localizeErrors(RULE, mydata)$adapt] <- NA
any(violatedEdits(RULE, mydata), na.rm = TRUE)

dlookr package

# Show the structure of the Carseats dataset.
str(Carseats)

# Working copy of Carseats into which missing values are injected on purpose.
carseats <- ISLR::Carseats

# Blank out Income in 20 randomly drawn rows. The RNG is switched to the
# R 3.5.0 behaviour and seeded so that the same rows are drawn on every run.
suppressWarnings(RNGversion("3.5.0"))
set.seed(123)
carseats$Income[sample(seq_len(nrow(carseats)), size = 20)] <- NA

# Blank out Urban in 10 randomly drawn rows, using a different seed.
suppressWarnings(RNGversion("3.5.0"))
set.seed(456)
carseats$Urban[sample(seq_len(nrow(carseats)), size = 10)] <- NA

Imputations of NA’s

# Impute the missing values of Income in carseats with method "rpart"; US is
# passed as the third argument (presumably the target variable - confirm
# against the dlookr documentation). Then summarise the imputation.
income <- imputate_na(carseats, Income, US, method = "rpart")
  summary(income)

# Visualise the imputation of Income.
plot(income)  # visualization of imputation

# Impute the missing values of Urban in carseats with method "mice", which
# needs the mice package to be loaded.
library(mice)
urban <- imputate_na(carseats, Urban, US, method = "mice")

# Print the result of the imputation of Urban.
urban 

# Summarise the imputation of Urban.
summary(urban) 

# Visualise the imputation of Urban.
plot(urban) 

Standardization

# Rescale Income and Sales with the "minmax" method, keep only the two rescaled
# columns, and draw them side by side as boxplots.
carseats %>%
  mutate(
    Income_minmax = transform(Income, method = "minmax"),
    Sales_minmax = transform(Sales, method = "minmax")
  ) %>%
  select(Income_minmax, Sales_minmax) %>%
  boxplot()

Binning

# Bin the Income variable of carseats and print the resulting bins object.
bin <- binning(carseats$Income) # Bin the Income variable; the default type argument is "quantile"
bin # Print bins class object

# Summarise the bins object created for Income.
summary(bin) 

# Plot the bins object created for Income.
plot(bin) 

 # Frequency table of ShelveLoc by binned Income: add the binned Income as a
 # column, count the rows in each (ShelveLoc, Income_bin) combination, sort by
 # descending frequency and show the first 10 rows.
 carseats %>%
 mutate(Income_bin = binning(carseats$Income) %>% 
                     extract()) %>%
 group_by(ShelveLoc, Income_bin) %>%
 summarise(freq = n()) %>%
 arrange(desc(freq)) %>%
 head(10)

# Bin Income into 4 bins with explicit labels and summarise the result.
income_fixed<- binning(carseats$Income, nbins = 4,
                   labels = c("low", "average", "high", "very high"))
summary(income_fixed)

# Visualise the 4 labelled Income bins.
plot(income_fixed)

# Bin the Advertising variable with binning_by(), using US as the other
# variable. NOTE(review): presumably US is the target of a supervised
# binning - confirm against the dlookr documentation. This overwrites the
# `bin` object created for Income above.
bin <- binning_by(carseats, "US", "Advertising")

# Summarise the bins object created for Advertising.
summary(bin)

# Plot the bins object created for Advertising.
plot(bin)

Exercise 1.

Find out which observations have too long sepals using the result of violatedEdits.

# Solution for exercise 1. The code is left commented out because `violated`
# is not found when the document is knitted in RStudio.
# if (length(violated) > 0) {
#   print(mydata[violated, ])
# }
# NOTE(review): `violated` is the result of violatedEdits(), which appears to
# be a record-by-rule logical matrix, so mydata[violated, ] probably does not
# select the intended rows; selecting on the column of the
# "Sepal.Length <= 30" rule is likely what is needed - confirm.

Exercise 3.

# Exercise 3: impute the missing values of Income in carseats with method
# "mice" (Education is passed as the third argument) so that the mean before
# and after the imputation can be compared.
MICE <- imputate_na(carseats, Income, Education, method = "mice")

## 
##  iter imp variable
##   1   1  Income  Urban
##   1   2  Income  Urban
##   1   3  Income  Urban
##   1   4  Income  Urban
##   1   5  Income  Urban
##   2   1  Income  Urban
##   2   2  Income  Urban
##   2   3  Income  Urban
##   2   4  Income  Urban
##   2   5  Income  Urban
##   3   1  Income  Urban
##   3   2  Income  Urban
##   3   3  Income  Urban
##   3   4  Income  Urban
##   3   5  Income  Urban
##   4   1  Income  Urban
##   4   2  Income  Urban
##   4   3  Income  Urban
##   4   4  Income  Urban
##   4   5  Income  Urban
##   5   1  Income  Urban
##   5   2  Income  Urban
##   5   3  Income  Urban
##   5   4  Income  Urban
##   5   5  Income  Urban

# Summarise the imputation; the table printed below compares the original and
# the imputed Income, including the row `mean`.
summary(MICE)

## * Impute missing values based on Multivariate Imputation by Chained Equations
##  - method : mice
##  - random seed : 37295
## 
## * Information of Imputation (before vs after)
##                     Original     Imputation  
## described_variables "value"      "value"     
## n                   "380"        "400"       
## na                  "20"         " 0"        
## mean                "68.86053"   "68.98650"  
## sd                  "28.09161"   "27.52111"  
## se_mean             "1.441069"   "1.376055"  
## IQR                 "48.25"      "45.25"     
## skewness            "0.04490600" "0.03761104"
## kurtosis            "-1.089201"  "-1.022505" 
## p00                 "21"         "21"        
## p01                 "21.79"      "21.99"     
## p05                 "26"         "26"        
## p10                 "30.0"       "30.9"      
## p20                 "39"         "40"        
## p25                 "42.75"      "44.75"     
## p30                 "48"         "52"        
## p40                 "62.00"      "62.36"     
## p50                 "69"         "69"        
## p60                 "78.00"      "77.32"     
## p70                 "86.3"       "84.0"      
## p75                 "91"         "90"        
## p80                 "96.2"       "95.2"      
## p90                 "108.1"      "106.1"     
## p95                 "115.05"     "115.00"    
## p99                 "119.21"     "119.01"    
## p100                "120"        "120"

 # Convert the imputation result to a data frame (overwriting the imputation
 # object stored in MICE) and print all of its rows.
 # NOTE(review): the exercise asks only for the mean before and after the
 # imputation, which summary(MICE) already reports; this full listing may be
 # unnecessary - confirm.
 MICE <- as.data.frame(MICE)
 MICE

##      MICE
## 1    73.0
## 2    48.0
## 3    35.0
## 4   100.0
## 5    64.0
## 6   113.0
## 7   105.0
## 8    81.0
## 9   110.0
## 10  113.0
## 11   78.0
## 12   94.0
## 13   35.0
## 14   28.0
## 15  117.0
## 16   95.0
## 17   66.2
## 18   62.6
## 19  110.0
## 20   76.0
## 21   90.0
## 22   29.0
## 23   46.0
## 24   31.0
## 25  119.0
## 26   32.0
## 27  115.0
## 28  118.0
## 29   74.0
## 30   99.0
## 31   94.0
## 32   58.0
## 33   32.0
## 34   38.0
## 35   54.0
## 36   84.0
## 37   76.0
## 38   41.0
## 39   73.0
## 40   70.8
## 41   98.0
## 42   53.0
## 43   69.0
## 44   42.0
## 45   79.0
## 46   63.0
## 47   90.0
## 48   98.0
## 49   52.0
## 50   93.0
## 51   32.0
## 52   90.0
## 53   40.0
## 54   64.0
## 55  103.0
## 56   81.0
## 57   82.0
## 58   91.0
## 59   93.0
## 60   71.0
## 61  102.0
## 62   32.0
## 63   45.0
## 64   88.0
## 65   67.0
## 66   26.0
## 67   92.0
## 68   61.0
## 69   69.0
## 70   59.0
## 71   81.0
## 72   51.0
## 73   45.0
## 74   90.0
## 75   68.0
## 76  111.0
## 77   87.0
## 78   71.0
## 79   48.0
## 80   67.0
## 81  100.0
## 82   72.0
## 83   83.0
## 84   36.0
## 85   25.0
## 86  103.0
## 87   84.0
## 88   67.0
## 89   42.0
## 90   66.0
## 91   22.0
## 92   46.0
## 93  113.0
## 94   30.0
## 95   53.8
## 96   25.0
## 97   42.0
## 98   82.0
## 99   77.0
## 100  47.0
## 101  69.0
## 102  93.0
## 103  22.0
## 104  91.0
## 105  96.0
## 106 100.0
## 107  33.0
## 108 107.0
## 109  79.0
## 110  65.0
## 111  62.0
## 112 118.0
## 113  99.0
## 114  29.0
## 115  87.0
## 116  60.8
## 117  75.0
## 118  53.0
## 119  88.0
## 120  94.0
## 121 105.0
## 122  89.0
## 123 100.0
## 124 103.0
## 125 113.0
## 126  88.2
## 127  68.0
## 128  48.0
## 129 100.0
## 130 120.0
## 131  84.0
## 132  69.0
## 133  87.0
## 134  98.0
## 135  31.0
## 136  94.0
## 137  75.0
## 138  42.0
## 139 103.0
## 140  62.0
## 141  60.0
## 142  42.0
## 143  84.0
## 144  88.0
## 145  68.0
## 146  63.0
## 147  83.0
## 148  54.0
## 149 119.0
## 150 120.0
## 151  84.0
## 152  58.0
## 153  78.0
## 154  36.0
## 155  69.0
## 156  72.0
## 157  34.0
## 158  58.0
## 159  90.0
## 160  60.0
## 161  28.0
## 162  21.0
## 163  80.4
## 164  64.0
## 165  64.0
## 166  58.0
## 167  67.0
## 168  73.0
## 169  89.0
## 170  41.0
## 171  39.0
## 172 106.0
## 173 102.0
## 174  91.0
## 175  24.0
## 176  89.0
## 177  77.8
## 178  72.0
## 179  95.0
## 180  25.0
## 181 112.0
## 182  83.0
## 183  60.0
## 184  74.0
## 185  33.0
## 186 100.0
## 187  51.0
## 188  32.0
## 189  37.0
## 190 117.0
## 191  37.0
## 192  42.0
## 193  26.0
## 194  70.0
## 195  98.0
## 196  93.0
## 197  28.0
## 198  61.0
## 199  80.0
## 200  88.0
## 201  92.0
## 202  83.0
## 203  78.0
## 204  82.0
## 205  80.0
## 206  22.0
## 207  67.0
## 208 105.0
## 209 100.2
## 210  21.0
## 211  41.0
## 212 118.0
## 213  69.0
## 214  84.0
## 215 115.0
## 216  83.0
## 217  71.2
## 218  44.0
## 219  61.0
## 220  79.0
## 221 120.0
## 222  72.2
## 223 119.0
## 224  45.0
## 225  82.0
## 226  25.0
## 227  33.0
## 228  64.0
## 229  73.0
## 230 104.0
## 231  60.0
## 232  69.0
## 233  80.0
## 234  76.0
## 235  62.0
## 236  32.0
## 237  34.0
## 238  28.0
## 239  24.0
## 240 105.0
## 241  80.0
## 242  63.0
## 243  46.0
## 244  25.0
## 245  30.0
## 246  43.0
## 247  56.0
## 248 114.0
## 249  52.0
## 250  67.0
## 251 105.0
## 252 111.0
## 253  97.0
## 254  24.0
## 255 104.0
## 256  81.0
## 257  40.0
## 258  62.0
## 259  38.0
## 260  36.0
## 261 117.0
## 262  42.0
## 263  60.4
## 264  26.0
## 265  29.0
## 266  35.0
## 267  93.0
## 268  82.0
## 269  57.0
## 270  69.0
## 271  26.0
## 272  56.0
## 273  33.0
## 274 106.0
## 275  93.0
## 276 119.0
## 277  69.0
## 278  48.0
## 279 113.0
## 280  57.0
## 281  86.0
## 282  69.0
## 283  96.0
## 284 110.0
## 285  46.0
## 286  26.0
## 287 118.0
## 288  44.0
## 289  40.0
## 290  77.0
## 291 111.0
## 292  70.0
## 293  66.0
## 294  84.0
## 295  76.0
## 296  35.0
## 297  44.0
## 298  83.0
## 299  63.0
## 300  40.0
## 301  78.0
## 302  93.0
## 303  77.0
## 304  52.0
## 305  98.0
## 306  29.0
## 307  32.0
## 308  92.0
## 309  80.0
## 310 111.0
## 311  65.0
## 312  68.0
## 313 117.0
## 314  81.0
## 315  79.4
## 316  21.0
## 317  36.0
## 318  30.0
## 319  72.0
## 320  45.0
## 321  70.0
## 322  39.0
## 323  50.0
## 324 105.0
## 325  65.0
## 326  69.0
## 327  30.0
## 328  38.0
## 329  66.0
## 330  54.0
## 331  59.0
## 332  63.0
## 333  33.0
## 334  60.0
## 335 117.0
## 336  70.0
## 337  35.0
## 338  38.0
## 339  24.0
## 340  44.0
## 341  29.0
## 342 120.0
## 343 102.0
## 344  42.0
## 345  80.0
## 346  68.0
## 347  65.0
## 348  39.0
## 349 102.0
## 350  27.0
## 351  68.8
## 352 115.0
## 353 103.0
## 354  67.0
## 355  31.0
## 356 100.0
## 357 109.0
## 358  73.0
## 359  96.0
## 360  62.0
## 361  86.0
## 362  25.0
## 363  55.0
## 364  69.8
## 365  21.0
## 366  30.0
## 367  56.0
## 368 106.0
## 369  22.0
## 370 100.0
## 371  41.0
## 372  81.0
## 373  60.2
## 374  71.4
## 375  47.0
## 376  46.0
## 377  60.0
## 378  61.0
## 379  88.0
## 380 111.0
## 381  64.0
## 382  65.0
## 383  28.0
## 384 117.0
## 385  37.0
## 386  73.0
## 387 116.0
## 388  73.0
## 389  89.0
## 390  42.0
## 391  75.0
## 392  63.0
## 393  42.0
## 394  51.0
## 395  58.0
## 396 108.0
## 397  53.4
## 398  26.0
## 399  79.0
## 400  37.0

 # The mean of the Income variable before and after the imputation is shown in the `mean` row of the summary(MICE) output above.

Exercise 4.

# Solution for exercise 4: impute the outliers of Price in carseats with the
# "capping" method and summarise the variable before and after the imputation.
Outers <- imputate_outlier(carseats, Price, method = "capping" )
summary(Outers)

## Impute outliers with capping
## 
## * Information of Imputation (before vs after)
##                     Original     Imputation  
## described_variables "value"      "value"     
## n                   "400"        "400"       
## na                  "0"          "0"         
## mean                "115.7950"   "115.8928"  
## sd                  "23.67666"   "22.61092"  
## se_mean             "1.183833"   "1.130546"  
## IQR                 "31"         "31"        
## skewness            "-0.1252862" "-0.0461621"
## kurtosis            " 0.4518850" "-0.3030578"
## p00                 "24"         "54"        
## p01                 "54.99"      "67.96"     
## p05                 "77"         "77"        
## p10                 "87"         "87"        
## p20                 "96.8"       "96.8"      
## p25                 "100"        "100"       
## p30                 "104"        "104"       
## p40                 "110"        "110"       
## p50                 "117"        "117"       
## p60                 "122"        "122"       
## p70                 "128.3"      "128.3"     
## p75                 "131"        "131"       
## p80                 "134"        "134"       
## p90                 "146"        "146"       
## p95                 "155.0500"   "155.0025"  
## p99                 "166.05"     "164.02"    
## p100                "191"        "173"

  # Visualise the outlier imputation of Price.
  plot(Outers)

### Exercise 5.

# Solution for exercise 5: split Income into 4 quantile-based bins and
# summarise how many observations fall into each bin.
binn <- binning(carseats$Income, nbins = 4, type = "quantile")
summary(binn)

##          levels freq   rate
## 1 [21,42.41667]   95 0.2375
## 2 (42.41667,69]  102 0.2550
## 3       (69,91]   89 0.2225
## 4      (91,120]   94 0.2350
## 5          <NA>   20 0.0500

# Visualise the 4 quantile-based Income bins.
plot(binn)

Raport 4

Marta Szczerska, Oskar Rabażyński

2023-11-21

Detecting NA

Exploring NA’s

Patterns

NA’s Mechanisms

NA’s in vars

NA’s in cases

NA’s across factors

Summaries for NA’s

Identify outliers

Checking consistency

Corrections

dlookr package

Imputations of NA’s

Standardization

Binning

Exercise 1.

Exercise 3.

Exercise 4.