library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# MASS supplies the Cars93 data set used below. Because it is attached after
# the tidyverse, MASS::select masks dplyr::select (see the message below), so
# any later column selection must call dplyr::select() explicitly.
library(MASS)
##
## 다음의 패키지를 부착합니다: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Keep only the first 10 columns of Cars93 so the examples stay small,
# then inspect the structure of the trimmed data frame.
Cars93 <- Cars93[, seq_len(10L), drop = FALSE]
str(Cars93)
## 'data.frame': 93 obs. of 10 variables:
## $ Manufacturer: Factor w/ 32 levels "Acura","Audi",..: 1 1 2 2 3 4 4 4 4 5 ...
## $ Model : Factor w/ 93 levels "100","190E","240",..: 49 56 9 1 6 24 54 74 73 35 ...
## $ Type : Factor w/ 6 levels "Compact","Large",..: 4 3 1 3 3 3 2 2 3 2 ...
## $ Min.Price : num 12.9 29.2 25.9 30.8 23.7 14.2 19.9 22.6 26.3 33 ...
## $ Price : num 15.9 33.9 29.1 37.7 30 15.7 20.8 23.7 26.3 34.7 ...
## $ Max.Price : num 18.8 38.7 32.3 44.6 36.2 17.3 21.7 24.9 26.3 36.3 ...
## $ MPG.city : int 25 18 20 19 22 22 19 16 19 16 ...
## $ MPG.highway : int 31 25 26 26 30 31 28 25 27 25 ...
## $ AirBags : Factor w/ 3 levels "Driver & Passenger",..: 3 1 2 1 2 2 2 2 2 2 ...
## $ DriveTrain : Factor w/ 3 levels "4WD","Front",..: 2 2 2 2 3 2 2 3 2 2 ...
# Load dlookr ----
library(dlookr)
## Warning: 패키지 'dlookr'는 R 버전 4.1.2에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
# Overall variable diagnosis ----
# Per-column overview of the whole data frame: type, missing count/percent,
# and unique count/rate for every variable.
dlookr::diagnose(Cars93)
## # A tibble: 10 x 6
## variables types missing_count missing_percent unique_count unique_rate
## <chr> <chr> <int> <dbl> <int> <dbl>
## 1 Manufacturer factor 0 0 32 0.344
## 2 Model factor 0 0 93 1
## 3 Type factor 0 0 6 0.0645
## 4 Min.Price numeric 0 0 79 0.849
## 5 Price numeric 0 0 81 0.871
## 6 Max.Price numeric 0 0 79 0.849
## 7 MPG.city integer 0 0 21 0.226
## 8 MPG.highway integer 0 0 22 0.237
## 9 AirBags factor 0 0 3 0.0323
## 10 DriveTrain factor 0 0 3 0.0323
# Same overview, restricted to the three listed factor columns.
dlookr::diagnose(Cars93, Manufacturer, Model, Type)
## # A tibble: 3 x 6
## variables types missing_count missing_percent unique_count unique_rate
## <chr> <chr> <int> <dbl> <int> <dbl>
## 1 Manufacturer factor 0 0 32 0.344
## 2 Model factor 0 0 93 1
## 3 Type factor 0 0 6 0.0645
# Numeric variable diagnosis ----
# Numeric diagnosis of the three price columns: min, quartiles, mean and max,
# plus counts of zero, negative and outlying values.
dlookr::diagnose_numeric(Cars93, Price, Max.Price, Min.Price)
## # A tibble: 3 x 10
## variables min Q1 mean median Q3 max zero minus outlier
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 Price 7.4 12.2 19.5 17.7 23.3 61.9 0 0 3
## 2 Max.Price 7.9 14.7 21.9 19.6 25.3 80 0 0 5
## 3 Min.Price 6.7 10.8 17.1 14.7 20.3 45.4 0 0 5
# Outlier diagnosis ----
# Outlier summary for the three price columns, keeping only the variables
# that actually contain outliers. Both calls are namespace-qualified, like the
# rest of the script, so the masking of filter() (stats vs. dplyr) cannot
# change which implementation runs.
Cars93 %>%
  dlookr::diagnose_outlier(Price, Max.Price, Min.Price) %>%
  dplyr::filter(outliers_cnt > 0)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## Price Price 3 3.225806 49.96667 19.50968
## Max.Price Max.Price 5 5.376344 51.84000 21.89892
## Min.Price Min.Price 5 5.376344 39.20000 17.12581
## without_mean
## Price 18.49444
## Max.Price 20.19773
## Min.Price 15.87159
# Categorical variable diagnosis ----
# Frequency table of Manufacturer levels with count, ratio (%) and rank;
# the recorded output lists the ten most frequent levels.
dlookr::diagnose_category(Cars93, Manufacturer)
## # A tibble: 10 x 6
## variables levels N freq ratio rank
## <chr> <chr> <int> <int> <dbl> <int>
## 1 Manufacturer Chevrolet 93 8 8.60 1
## 2 Manufacturer Ford 93 8 8.60 1
## 3 Manufacturer Dodge 93 6 6.45 3
## 4 Manufacturer Mazda 93 5 5.38 4
## 5 Manufacturer Pontiac 93 5 5.38 4
## 6 Manufacturer Buick 93 4 4.30 6
## 7 Manufacturer Hyundai 93 4 4.30 6
## 8 Manufacturer Nissan 93 4 4.30 6
## 9 Manufacturer Oldsmobile 93 4 4.30 6
## 10 Manufacturer Toyota 93 4 4.30 6
# Descriptive statistics ----
# Descriptive statistics (mean, sd, IQR, skewness, kurtosis, percentiles)
# for the numeric columns; factor columns do not appear in the result.
dlookr::describe(Cars93)
## # A tibble: 5 x 26
## variable n na mean sd se_mean IQR skewness kurtosis p00 p01
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Min.Price 93 0 17.1 8.75 0.907 9.5 1.18 1.02 6.7 6.79
## 2 Price 93 0 19.5 9.66 1.00 11.1 1.53 3.43 7.4 7.95
## 3 Max.Price 93 0 21.9 11.0 1.14 10.6 2.03 7.44 7.9 9.00
## 4 MPG.city 93 0 22.4 5.62 0.583 7 1.70 4.00 15 15
## 5 MPG.highw~ 93 0 29.1 5.33 0.553 5 1.23 2.61 20 20
## # ... with 15 more variables: p05 <dbl>, p10 <dbl>, p20 <dbl>, p25 <dbl>,
## # p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>, p75 <dbl>,
## # p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Descriptive statistics for the Price column only.
dlookr::describe(Cars93, Price)
## # A tibble: 1 x 26
## variable n na mean sd se_mean IQR skewness kurtosis p00 p01
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Price 93 0 19.5 9.66 1.00 11.1 1.53 3.43 7.4 7.95
## # ... with 15 more variables: p05 <dbl>, p10 <dbl>, p20 <dbl>, p25 <dbl>,
## # p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>, p75 <dbl>,
## # p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# Combining dlookr with dplyr ----
# List the variables that contain missing values, most-missing first.
# missing_count is a column of the tibble returned by diagnose().
# The recorded result is empty: no column in this subset has missing values.
dlookr::diagnose(Cars93) %>%
  dplyr::filter(missing_count > 0) %>%
  dplyr::arrange(dplyr::desc(missing_count))
## # A tibble: 0 x 6
## # ... with 6 variables: variables <chr>, types <chr>, missing_count <int>,
## # missing_percent <dbl>, unique_count <int>, unique_rate <dbl>
# Per-manufacturer descriptive statistics of the three price columns.
# Groups with very few models yield NA for sd, skewness or kurtosis
# (see the recorded output).
dplyr::group_by(Cars93, Manufacturer) %>%
  dlookr::describe(Price, Min.Price, Max.Price)
## # A tibble: 96 x 27
## variable Manufacturer n na mean sd se_mean IQR skewness kurtosis
## <chr> <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Max.Pri~ Acura 2 0 28.8 14.1 9.95 9.95 NA NA
## 2 Max.Pri~ Audi 2 0 38.4 8.70 6.15 6.15 NA NA
## 3 Max.Pri~ BMW 1 0 36.2 NA NA 0 NA NA
## 4 Max.Pri~ Buick 4 0 22.6 3.99 2.00 4.65 -0.833 -0.559
## 5 Max.Pri~ Cadillac 2 0 39.5 4.53 3.20 3.20 NA NA
## 6 Max.Pri~ Chevrolet 8 0 20.3 8.92 3.15 1.15 2.34 6.34
## 7 Max.Pri~ Chrylser 1 0 18.4 NA NA 0 NA NA
## 8 Max.Pri~ Chrysler 2 0 23.3 8.77 6.2 6.20 NA NA
## 9 Max.Pri~ Dodge 6 0 18.9 8.33 3.40 8.07 1.18 0.613
## 10 Max.Pri~ Eagle 2 0 18.8 3.32 2.35 2.35 NA NA
## # ... with 86 more rows, and 17 more variables: p00 <dbl>, p01 <dbl>,
## # p05 <dbl>, p10 <dbl>, p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>,
## # p50 <dbl>, p60 <dbl>, p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>,
## # p95 <dbl>, p99 <dbl>, p100 <dbl>