# 결측치와 이상치 처리
# 01 결측치
# 데이터에서 값이 비어 있는 상태를 의미
# Deliberately no `rm(list = ls())` here: wiping the global environment
# destroys objects belonging to whatever session runs this script, and it
# still does not produce a clean state (attached packages and options
# persist). Run the script in a fresh R session instead.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the built-in airquality data set and print per-column summary
# statistics; summary() also reports how many NA's each column holds.
data(airquality)
summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# In R, NA denotes a missing value.
# str() prints the column types together with the first few values of each
# column, which makes the NA entries visible.
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
# is.na() marks every missing value as TRUE; table() then counts the
# TRUE/FALSE cells over the whole data frame (all columns together).
table(is.na(airquality))
## 
## FALSE  TRUE 
##   874    44
# Aggregates cannot be computed while missing values are present; the plain
# calls below would not give a usable result:
# sum(airquality$Ozone)
# mean(airquality$Ozone)
# na.rm = TRUE: exclude the missing values before aggregating.
sum(airquality[["Ozone"]], na.rm = TRUE)
## [1] 4887
mean(airquality[["Ozone"]], na.rm = TRUE)
## [1] 42.12931

# names(airquality) <-tolower(names(airquality))
# is.na(airquality$Ozone)
# table(is.na(airquality))
# table(is.na(airquality$Ozone))
# summary(is.na(airquality))
# sum(airquality$Ozone)
# mean(airquality$Ozone)
# sum(airquality$Ozone,na.rm=TRUE)
# mean(airquality$Ozone,na.rm=TRUE)

# A box plot is a visualization that shows a variable's distribution and its
# outliers at a glance. It is widely used to check the median, quartiles,
# range, and outliers.
data("iris")
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Box plot of petal length for each species.
# The formula uses bare column names resolved through `data`, so the `data`
# argument is actually used and the default axis labels come out as the
# column names instead of `iris$...` expressions.
boxplot(Petal.Length ~ Species, data = iris)

# Load the built-in ChickWeight data set and print summary statistics for
# each of its columns.
data(ChickWeight)
summary(ChickWeight)
##      weight           Time           Chick     Diet   
##  Min.   : 35.0   Min.   : 0.00   13     : 12   1:220  
##  1st Qu.: 63.0   1st Qu.: 4.00   9      : 12   2:120  
##  Median :103.0   Median :10.00   20     : 12   3:120  
##  Mean   :121.8   Mean   :10.72   10     : 12   4:118  
##  3rd Qu.:163.8   3rd Qu.:16.00   17     : 12          
##  Max.   :373.0   Max.   :21.00   19     : 12          
##                                  (Other):506
# Box plot of chick weight for each diet.
# Bare column names are looked up in `data`, so the `data` argument is used
# and the default axis labels are the column names rather than
# `ChickWeight$...` expressions.
boxplot(weight ~ Diet, data = ChickWeight)

# Load the built-in chickwts data set and print summary statistics for its
# weight and feed columns.
data(chickwts)
summary(chickwts)
##      weight             feed   
##  Min.   :108.0   casein   :12  
##  1st Qu.:204.5   horsebean:10  
##  Median :258.0   linseed  :12  
##  Mean   :261.3   meatmeal :11  
##  3rd Qu.:323.5   soybean  :14  
##  Max.   :423.0   sunflower:12
# Box plot of chick weight for each feed type.
# Bare column names are looked up in `data`, so the `data` argument is used
# and the default axis labels are the column names rather than
# `chickwts$...` expressions.
boxplot(weight ~ feed, data = chickwts)

# Histogram of chick weights. The call is kept verbatim because hist() builds
# its default title and x-axis label from the argument expression.
hist(ChickWeight$weight)

# For the weight variable the mean is greater than the median, so the
# distribution has a right tail (right-skewed).
# Outliers can be explored.
# The total number of observations cannot be determined.
# The number of missing values cannot be checked (a box plot is drawn after
# excluding missing values).

# Kernel density estimate of the same variable. At top level this prints the
# estimate's summary (call, bandwidth, x/y ranges); it does not draw a plot.
density(ChickWeight$weight)
## 
## Call:
##  density.default(x = ChickWeight$weight)
## 
## Data: ChickWeight$weight (578 obs.); Bandwidth 'bw' = 17.93
## 
##        x                y            
##  Min.   :-18.79   Min.   :4.765e-07  
##  1st Qu.: 92.61   1st Qu.:2.908e-04  
##  Median :204.00   Median :1.326e-03  
##  Mean   :204.00   Mean   :2.240e-03  
##  3rd Qu.:315.39   3rd Qu.:3.976e-03  
##  Max.   :426.79   Max.   :7.323e-03
# Small example data frame: a gender code stored as text and an income value
# for each of six rows.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)

# Print the data frame.
ott7
##   gender income
## 1      1    200
## 2      1    250
## 3      2    200
## 4      2    300
## 5      2    200
## 6      3    150
# Frequency of each gender code.
table(ott7[["gender"]])
## 
## 1 2 3 
## 2 3 1