#exam03) 주어진 데이터에서 빈값 또는 결측값들의 비율을 확인하고 가장 결측율이 높은
#변수명을 출력하시오
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the passenger data, then show each column's type and first few values.
titanic <- read.csv(file = "train100.csv")
glimpse(titanic)
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1~
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3~
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl~
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal~
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, ~
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0~
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0~
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37~
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,~
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C~
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"~
# Tally NA (TRUE) versus non-NA (FALSE) cells for each column.
titanic %>%
  is.na() %>%
  summary()
## PassengerId Survived Pclass Name
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:891 FALSE:891 FALSE:891
##
## Sex Age SibSp Parch
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:714 FALSE:891 FALSE:891
## TRUE :177
## Ticket Fare Cabin Embarked
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:891 FALSE:891 FALSE:891 FALSE:891
##
# Number of NA cells per column. Blank strings ("") are not NA, so columns
# that only contain blanks (e.g. Cabin in the glimpse output) still show 0.
titanic %>%
  is.na() %>%
  colSums()
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# (1) For every variable, count the rows that are missing (NA) or blank ("")
# and turn the count into a percentage of all rows.
# All columns are checked rather than only Age and Embarked: the glimpse()
# output shows blank strings in Cabin, and is.na() alone never reports those.
df <- nrow(titanic) # total row count, the denominator of every percentage

# as.character() lets one test cover numeric and character columns alike.
# The comparison is NA where the cell is NA, but `TRUE | NA` is TRUE, so
# sum() never sees an NA.
missing_n <- vapply(
  titanic,
  function(column) sum(is.na(column) | as.character(column) == ""),
  integer(1)
)

# One row per variable: name, missing/blank count, and missing rate in percent.
df3 <- data.frame(
  variable = names(missing_n),
  n = unname(missing_n),
  pct = unname(missing_n) / df * 100,
  stringsAsFactors = FALSE
)

# Age and Embarked results under their original names and in their original
# one-row n/pct layout.
df1 <- df3[df3$variable == "Age", c("n", "pct")]
rownames(df1) <- NULL
df2 <- df3[df3$variable == "Embarked", c("n", "pct")]
rownames(df2) <- NULL
df1
## n pct
## 1 177 19.86532
df2
## n pct
## 1 2 0.2244669

# Full table, so the rates of all variables can be compared.
df3

# (2) Print the variable with the highest missing rate, chosen from the data
# instead of a hard-coded column position.
print(df3$variable[which.max(df3$pct)])