Cleaning Data
- Data Preparation
zmPDSwR-master/Custdata/exampleData.rData
를 필요로 함.
- working directory 정리
rm(list=ls())
ls()
## character(0)
load("../../zmPDSwR-master/Custdata//exampleData.rData")
ls()
## [1] "custdata" "hhdata" "medianincome"
str(custdata)
## 'data.frame': 1000 obs. of 19 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ age.normalized : num -0.461 1.341 -0.302 0.546 -0.779 ...
## $ Median.Income : num 52371 52371 52371 52371 52371 ...
## $ income.norm : num 1.566 0.936 0.134 0.71 1.337 ...
## $ gp : num 0.935 0.116 0.991 0.187 0.849 ...
## $ income.lt.30K : logi FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ age.range : Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
custdata.2 <- custdata
(v.to.add <- c("age.normalized", "Median.Income", "income.norm", "gp", "income.lt.30K", "age.range"))
## [1] "age.normalized" "Median.Income" "income.norm" "gp"
## [5] "income.lt.30K" "age.range"
#(index.to.add <- which(names(custdata) %in% v.to.add))
(v.to.retain <- setdiff(names(custdata), v.to.add))
## [1] "state.of.res" "custid" "sex"
## [4] "is.employed" "income" "marital.stat"
## [7] "health.ins" "housing.type" "recent.move"
## [10] "num.vehicles" "age" "is.employed.fix1"
## [13] "Income"
#custdata <- custdata[-index.to.add]
custdata <- custdata[v.to.retain]
str(custdata)
## 'data.frame': 1000 obs. of 13 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
Treating missing values
- Checking locations fo missing data
options(width=132)
summary(custdata[is.na(custdata$housing.type), c("recent.move", "num.vehicles")])
## recent.move num.vehicles
## Mode:logical Min. : NA
## NA's:56 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :56
str(custdata)
## 'data.frame': 1000 obs. of 13 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
summary(custdata[c("housing.type", "recent.move", "num.vehicles", "is.employed")])
## housing.type recent.move num.vehicles is.employed
## Homeowner free and clear :157 Mode :logical Min. :0.000 Mode :logical
## Homeowner with mortgage/loan:412 FALSE:820 1st Qu.:1.000 FALSE:73
## Occupied with no rent : 11 TRUE :124 Median :2.000 TRUE :599
## Rented :364 NA's :56 Mean :1.916 NA's :328
## NA's : 56 3rd Qu.:2.000
## Max. :6.000
## NA's :56
is.employed
의 NA를 missing
이라는 새로운 범주로 설정
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed), "missing", ifelse(custdata$is.employed == TRUE, "employed", "not employed"))
summary(custdata$is.employed.fix)
## Length Class Mode
## 1000 character character
summary(factor(custdata$is.employed.fix))
## employed missing not employed
## 599 328 73
summary(as.factor(custdata$is.employed.fix))
## employed missing not employed
## 599 328 73
summary(factor(custdata$is.employed.fix, levels=c("employed", "not employed", "missing")))
## employed not employed missing
## 599 73 328
# summary(as.factor(custdata$is.employed.fix, levels=c("employed", "not employed", "missing")))
missing
의 성격 파악, not in the active workforce? (from the summary of age)
summary(custdata[custdata$is.employed.fix=="missing", ])
## state.of.res custid sex is.employed income marital.stat health.ins
## California : 43 Min. : 2068 F:172 Mode:logical Min. : 0 Divorced/Separated: 47 Mode :logical
## New York : 33 1st Qu.: 314974 M:156 NA's:328 1st Qu.: 1550 Married :145 FALSE:48
## Ohio : 21 Median : 623182 Median : 14450 Never Married : 56 TRUE :280
## Pennsylvania : 17 Mean : 684007 Mean : 27524 Widowed : 80 NA's :0
## Michigan : 15 3rd Qu.:1050329 3rd Qu.: 31650
## Massachusetts: 14 Max. :1412971 Max. :269000
## (Other) :185
## housing.type recent.move num.vehicles age is.employed.fix1 Income
## Homeowner free and clear :96 Mode :logical Min. :0.000 Min. : 0.00 Length:328 Min. : 0
## Homeowner with mortgage/loan:89 FALSE:257 1st Qu.:1.000 1st Qu.: 49.00 Class :character 1st Qu.: 24000
## Occupied with no rent : 3 TRUE :23 Median :2.000 Median : 67.00 Mode :character Median : 45000
## Rented :92 NA's :48 Mean :1.643 Mean : 63.22 Mean : 62990
## NA's :48 3rd Qu.:2.000 3rd Qu.: 78.00 3rd Qu.: 80000
## Max. :6.000 Max. :123.06 Max. :388000
## NA's :48 NA's :99
## is.employed.fix
## Length:328
## Class :character
## Mode :character
##
##
##
##
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed), "not in active workforce", ifelse(custdata$is.employed == TRUE, "employed", "not employed"))
summary(factor(custdata$is.employed.fix))
## employed not employed not in active workforce
## 599 73 328
- Missing values is numeric data
summary(custdata$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 25000 45000 66200 82000 615000 328
mean.income <- mean(custdata$Income, na.rm = TRUE)
Income.fix <- ifelse(is.na(custdata$Income), mean.income, custdata$Income)
summary(Income.fix)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 35000 66200 66200 66200 615000
library(ggplot2)
ggplot(custdata, aes(x = Income)) + geom_histogram(binwidth=30000, aes(y = ..density..), alpha=0.5)

ggplot(data.frame(Income.fix), aes(x = Income.fix)) + geom_histogram(binwidth=30000, aes(y = ..density..), alpha=0.5)

ggplot(data.frame(Income.fix), aes(x = Income.fix)) + geom_bar(stat="bin", binwidth=30000, aes(y = ..density..), alpha=0.5)

Income.breaks <- c(0, 10000, 50000, 100000, 250000, 1000000)
Income.groups <- cut(custdata$Income, breaks = Income.breaks, include.lowest = TRUE)
summary(Income.groups)
## [0,1e+04] (1e+04,5e+04] (5e+04,1e+05] (1e+05,2.5e+05] (2.5e+05,1e+06] NA's
## 63 312 178 98 21 328
table(Income.groups, useNA = "ifany")
## Income.groups
## [0,1e+04] (1e+04,5e+04] (5e+04,1e+05] (1e+05,2.5e+05] (2.5e+05,1e+06] <NA>
## 63 312 178 98 21 328
str(Income.groups)
## Factor w/ 5 levels "[0,1e+04]","(1e+04,5e+04]",..: NA NA 1 2 2 4 4 2 NA 2 ...
Income.groups <- as.character(Income.groups)
Income.groups <- ifelse(is.na(Income.groups), "no income", Income.groups)
str(Income.groups)
## chr [1:1000] "no income" "no income" "[0,1e+04]" "(1e+04,5e+04]" "(1e+04,5e+04]" "(1e+05,2.5e+05]" ...
summary(Income.groups)
## Length Class Mode
## 1000 character character
summary(factor(Income.groups))
## (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,1e+06] (5e+04,1e+05] [0,1e+04] no income
## 312 98 21 178 63 328
table(Income.groups)
## Income.groups
## (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,1e+06] (5e+04,1e+05] [0,1e+04] no income
## 312 98 21 178 63 328
missing.Income <- is.na(custdata$Income)
Income.fix.2 <- ifelse(is.na(custdata$Income), 0, custdata$Income)
ggplot(data.frame(Income.fix), aes(x = Income.fix)) + geom_histogram(binwidth=30000, aes(y = ..density..), alpha=0.5)

ggplot(data.frame(Income.fix.2), aes(x = Income.fix.2)) + geom_histogram(binwidth=30000, aes(y = ..density..), alpha=0.5)

Data Transformation
str(custdata)
## 'data.frame': 1000 obs. of 14 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
## $ is.employed.fix : chr "employed" "not in active workforce" "not in active workforce" "not in active workforce" ...
str(medianincome)
## 'data.frame': 52 obs. of 2 variables:
## $ State : Factor w/ 52 levels "","Alabama","Alaska",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Median.Income: num 52371 44191 65720 48484 39832 ...
summary(medianincome)
## State Median.Income
## : 1 Min. :37427
## Alabama : 1 1st Qu.:47483
## Alaska : 1 Median :52274
## Arizona : 1 Mean :52655
## Arkansas : 1 3rd Qu.:57195
## California: 1 Max. :68187
## (Other) :46
custdata <- merge(custdata, medianincome, by.x = "state.of.res", by.y = "State")
str(custdata)
## 'data.frame': 1000 obs. of 15 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
## $ is.employed.fix : chr "employed" "not in active workforce" "not in active workforce" "not in active workforce" ...
## $ Median.Income : num 52371 52371 52371 52371 52371 ...
summary(custdata[, c("state.of.res", "income", "Median.Income")])
## state.of.res income Median.Income
## California :114 Min. : -8700 Min. :37427
## New York : 94 1st Qu.: 14600 1st Qu.:44819
## Pennsylvania: 63 Median : 35000 Median :50118
## Ohio : 59 Mean : 53505 Mean :50919
## Illinois : 52 3rd Qu.: 67000 3rd Qu.:55534
## Texas : 51 Max. :615000 Max. :68187
## (Other) :567
custdata$income.norm <- with(custdata, income/Median.Income)
summary(custdata$income.norm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1956 0.2812 0.6712 1.0780 1.3510 11.7900
- Converting continuous variables to discrete
custdata$income.lt.20K <- custdata$income < 20000
summary(custdata$income.lt.20K)
## Mode FALSE TRUE NA's
## logical 678 322 0
- Converting age into ranges
age.breaks <- c(0, 25, 65, Inf)
custdata$age.range <- cut(custdata$age, breaks = age.breaks, include.lowest = TRUE)
summary(custdata$age.range)
## [0,25] (25,65] (65,Inf]
## 56 732 212
str(custdata$age.range)
## Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
summary(custdata$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 38.0 50.0 51.7 64.0 146.7
mean.age <- mean(custdata$age)
custdata$age.normalized <- custdata$age/mean.age
summary(custdata$age.normalized)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.7350 0.9671 1.0000 1.2380 2.8370
summary(custdata$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 38.0 50.0 51.7 64.0 146.7
sd.age <- sd(custdata$age)
custdata$age.normalized <- (custdata$age - mean.age)/sd.age
summary(custdata$age.normalized)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.74100 -0.72630 -0.09011 0.00000 0.65210 5.03500
summary(scale(custdata$age))
## V1
## Min. :-2.74074
## 1st Qu.:-0.72626
## Median :-0.09011
## Mean : 0.00000
## 3rd Qu.: 0.65207
## Max. : 5.03516
- Figure 4.2
y = as.numeric(health.ins)
와 y = health.ins
라고 했을 때의 차이 유의.
ggplot(subset(custdata, custdata$income > 1000), aes(x = income, y = as.numeric(health.ins))) +
geom_point(alpha = 0.5, position = position_jitter(w = 0.05, h = 0.05)) +
geom_smooth() + scale_x_log10() + annotation_logticks(sides = "bt")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

- Figure 4.4
subset()
설정을 하지 않으면 어떻게 될까?
library(scales)
ggplot(custdata, aes(x = income)) + geom_density() + scale_x_continuous(labels = dollar)

ggplot(subset(custdata, custdata$income > 0), aes(x = log10(income))) + geom_density()

ggplot(subset(custdata, custdata$income > 0), aes(x = income)) + geom_density() +
scale_x_log10(breaks = c(100, 1000, 10000, 100000), labels = dollar) +
annotation_logticks(sides = "bt")

signed.log10 <- function(x) {
ifelse(abs(x) <= 1, 0, sign(x) * log10(abs(x)))
}
ggplot(custdata, aes(x = log10(income))) + geom_density() +
geom_density(aes(x = signed.log10(income)), linetype="dotted")
## Warning: NaN이 생성되었습니다
## Warning: NaN이 생성되었습니다
## Warning: Removed 79 rows containing non-finite values (stat_density).

dump("signed.log10", file="signed.log10.R")