# exam1) 주어진 캘리포니아 주택 데이터 첫번째 행부터 순서대로 80%까지의 데이터를 추출한후
# 'total_bedrooms'변수의 결측값(NA)을 'total_bedrooms'변수의 중앙값으로 대체하고
# 대체전의 'total_bedrooms' 변수의 표준편차값과 대체후에 표준편차의 차이를 구하시오
library(dplyr)
## Warning: 패키지 'dplyr'는 R 버전 4.1.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# ① Load the data and keep the first 80% of the rows, in file order.
house <- read.csv("housing.csv")
nrow(house)
## [1] 20640
# floor() makes the row count a whole number explicitly instead of relying on
# `:` silently truncating a fractional value.
rownum <- floor(nrow(house) * 0.8)
rownum
## [1] 16512
# seq_len() yields an empty index when rownum is 0, whereas 1:0 would be
# c(1, 0) and select a spurious all-NA row from an empty data frame.
house1 <- house[seq_len(rownum), ]
# ② Inspect the structure (column names, types, leading values) of the subset.
glimpse(house1)
## Rows: 16,512
## Columns: 10
## $ longitude <dbl> -122.23, -122.22, -122.24, -122.25, -122.25, -122.2~
## $ latitude <dbl> 37.88, 37.86, 37.85, 37.85, 37.85, 37.85, 37.84, 37~
## $ housing_median_age <int> 41, 21, 52, 52, 52, 52, 52, 52, 42, 52, 52, 52, 52,~
## $ total_rooms <int> 880, 7099, 1467, 1274, 1627, 919, 2535, 3104, 2555,~
## $ total_bedrooms <int> 129, 1106, 190, 235, 280, 213, 489, 687, 665, 707, ~
## $ population <int> 322, 2401, 496, 558, 565, 413, 1094, 1157, 1206, 15~
## $ households <int> 126, 1138, 177, 219, 259, 193, 514, 647, 595, 714, ~
## $ median_income <dbl> 8.3252, 8.3014, 7.2574, 5.6431, 3.8462, 4.0368, 3.6~
## $ median_house_value <int> 452600, 358500, 352100, 341300, 342200, 269700, 299~
## $ ocean_proximity <chr> "NEAR BAY", "NEAR BAY", "NEAR BAY", "NEAR BAY", "NE~
# ③ Check for missing values (NA).
# Build the NA mask once: a logical matrix that is TRUE where a cell is NA.
na_mask <- is.na(house1)
# Number of NA cells per column.
colSums(na_mask)
## longitude latitude housing_median_age total_rooms
## 0 0 0 0
## total_bedrooms population households median_income
## 159 0 0 0
## median_house_value ocean_proximity
## 0 0
# TRUE/FALSE tallies per column, as a cross-check of the counts above.
summary(na_mask)
## longitude latitude housing_median_age total_rooms
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:16512 FALSE:16512 FALSE:16512 FALSE:16512
##
## total_bedrooms population households median_income
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:16353 FALSE:16512 FALSE:16512 FALSE:16512
## TRUE :159
## median_house_value ocean_proximity
## Mode :logical Mode :logical
## FALSE:16512 FALSE:16512
##
# ④ Standard deviation of total_bedrooms before imputation.
# na.rm = TRUE drops the missing values; without it sd() would return NA.
df1 <- sd(house1[["total_bedrooms"]], na.rm = TRUE)
print(df1)
## [1] 435.9006
# ⑤ Replace missing total_bedrooms values with the column median.
bedrooms <- house1[["total_bedrooms"]]
# Median of the observed (non-NA) values only.
df2 <- median(bedrooms, na.rm = TRUE)
print(df2)
## [1] 436
# Keep observed values as they are; fill each NA position with the median.
house1[["total_bedrooms"]] <- ifelse(is.na(bedrooms), df2, bedrooms)
# Never test for missing values with `house1$total_bedrooms == NA`: comparing
# against NA yields NA, not TRUE. Use is.na() instead, as in:
# house1 %>% filter(!is.na(total_bedrooms))
# Confirm that no column contains NA any more.
colSums(is.na(house1))
## longitude latitude housing_median_age total_rooms
## 0 0 0 0
## total_bedrooms population households median_income
## 0 0 0 0
## median_house_value ocean_proximity
## 0 0
# ⑥ Standard deviation of total_bedrooms after imputation.
# No na.rm is needed here because the column no longer contains NA.
df3 <- sd(house1[["total_bedrooms"]])
print(df3)
## [1] 433.9254
# Answer to the exercise: pre-imputation SD minus post-imputation SD.
print(df1 - df3)
## [1] 1.975147