data("airquality")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-column descriptive statistics; columns containing missing values also
# report an NA count.
summary(airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 
# Lower-case every column name so the rest of the script can refer to the
# columns as `ozone`, `solar.r`, and so on.
colnames(airquality) <- tolower(colnames(airquality))
colnames(airquality)
## [1] "ozone"   "solar.r" "wind"    "temp"    "month"   "day"
# Element-wise missing-value flags for the ozone column (TRUE = NA).
is.na(airquality[["ozone"]])
##   [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [37]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE
##  [49] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [61]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [73] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
# Frequency of missing vs. non-missing cells across the whole data set.
table(is.na(airquality))
## 
## FALSE  TRUE 
##   874    44
# The same frequency count restricted to the ozone column.
table(is.na(airquality[["ozone"]]))
## 
## FALSE  TRUE 
##   116    37
# Missing-value counts broken down by column.
summary(is.na(airquality))
##    ozone          solar.r           wind            temp        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:116       FALSE:146       FALSE:153       FALSE:153      
##  TRUE :37        TRUE :7                                        
##    month            day         
##  Mode :logical   Mode :logical  
##  FALSE:153       FALSE:153      
## 
# Aggregates return NA when the input contains NA ...
sum(airquality[["ozone"]])
## [1] NA
mean(airquality[["ozone"]])
## [1] NA
# ... unless the missing values are dropped explicitly with na.rm = TRUE.
sum(airquality[["ozone"]], na.rm = TRUE)
## [1] 4887
mean(airquality[["ozone"]], na.rm = TRUE)
## [1] 42.12931
# Listwise deletion: drop every row that contains at least one NA.
# NOTE: this overwrites `airquality`, so everything below operates on the
# complete rows only.
airquality <- na.omit(airquality)
table(is.na(airquality))
## 
## FALSE 
##   666
# dplyr equivalent for a single column: keep rows whose ozone is not missing.
airquality %>%
  filter(!is.na(ozone)) %>%
  head(3)
##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
# Keep rows where both ozone and solar.r are present.
airquality %>%
  filter(!is.na(ozone) & !is.na(solar.r)) %>%
  head()
##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    23     299  8.6   65     5   7
## 6    19      99 13.8   59     5   8
# Mean of ozone over the rows that survived na.omit(); this differs from the
# mean over all non-missing ozone values in the unfiltered data.
mean(airquality$ozone, na.rm = TRUE)
## [1] 42.0991
# Mean imputation: replace missing ozone values with the column mean. The mean
# is computed here rather than pasted in as a literal so it cannot go stale.
# Because na.omit() above already removed every row with an NA, no value is
# actually replaced on this data.
airquality$ozone <- ifelse(
  is.na(airquality$ozone),
  mean(airquality$ozone, na.rm = TRUE),
  airquality$ozone
)
table(is.na(airquality$ozone))
## 
## FALSE 
##   111
# Small example data frame: a gender code stored as text and an income value
# for six rows.
ott7 <- data.frame(
  gender = rep(c("1", "2", "3"), times = c(2, 3, 1)),
  income = c(200, 250, 200, 300, 200, 150)
)
ott7
##   gender income
## 1      1    200
## 2      1    250
## 3      2    200
## 4      2    300
## 5      2    200
## 6      3    150
# Check for outliers with a box plot. `$stats` holds, from top to bottom:
# lower whisker end, lower hinge (Q1), median, upper hinge (Q3) and upper
# whisker end.
boxplot(iris[["Sepal.Width"]])$stats

##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
# IQR = Q3 - Q1
# Lower fence: Q1 - 1.5 * IQR
# Upper fence: Q3 + 1.5 * IQR
# Each whisker ends at the most extreme observation that still lies inside
# its fence, so the whisker ends above need not equal the fence values.