# Practical Data Science with R, by Nina Zumel and John Mount
# Chapter 4: Managing data
# This chapter covers:
# - Fixing data quality problems
# - Organizing your data for the modeling process
setwd("C:\\Users\\Luis\\Documents\\practical_data_science\\zmPDSwR-master\\zmPDSwR-master\\Custdata")
# The working directory already points at the Custdata folder, so the file
# can be read with a relative path
custdata <- read.table("custdata.tsv", header = TRUE, sep = "\t")
names(custdata)
## [1] "custid" "sex" "is.employed" "income"
## [5] "marital.stat" "health.ins" "housing.type" "recent.move"
## [9] "num.vehicles" "age" "state.of.res"
summary(custdata)
## custid sex is.employed income
## Min. : 2068 F:440 Mode :logical Min. : -8700
## 1st Qu.: 345667 M:560 FALSE:73 1st Qu.: 14600
## Median : 693403 TRUE :599 Median : 35000
## Mean : 698500 NA's :328 Mean : 53505
## 3rd Qu.:1044606 3rd Qu.: 67000
## Max. :1414286 Max. :615000
##
## marital.stat health.ins
## Divorced/Separated:155 Mode :logical
## Married :516 FALSE:159
## Never Married :233 TRUE :841
## Widowed : 96 NA's :0
##
##
##
## housing.type recent.move num.vehicles
## Homeowner free and clear :157 Mode :logical Min. :0.000
## Homeowner with mortgage/loan:412 FALSE:820 1st Qu.:1.000
## Occupied with no rent : 11 TRUE :124 Median :2.000
## Rented :364 NA's :56 Mean :1.916
## NA's : 56 3rd Qu.:2.000
## Max. :6.000
## NA's :56
## age state.of.res
## Min. : 0.0 California :100
## 1st Qu.: 38.0 New York : 71
## Median : 50.0 Pennsylvania: 70
## Mean : 51.7 Texas : 56
## 3rd Qu.: 64.0 Michigan : 52
## Max. :146.7 Ohio : 51
## (Other) :600
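# The summary already flags several data quality problems: income has a
# negative minimum (-8700), age runs from 0 to 146.7 (values that look like
# sentinels or errors), and is.employed, housing.type, recent.move, and
# num.vehicles all contain missing values.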
str(custdata)
## 'data.frame': 1000 obs. of 11 variables:
## $ custid : int 2068 2073 2848 5641 6369 8322 8521 12195 14989 15917 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 2 2 1 1 2 2 2 1 ...
## $ is.employed : logi NA NA TRUE TRUE TRUE TRUE ...
## $ income : int 11300 0 4500 20000 12000 180000 120000 40000 9400 24000 ...
## $ marital.stat: Factor w/ 4 levels "Divorced/Separated",..: 2 2 3 3 3 3 3 2 2 1 ...
## $ health.ins : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ housing.type: Factor w/ 4 levels "Homeowner free and clear",..: 1 4 4 3 4 2 1 4 4 1 ...
## $ recent.move : logi FALSE TRUE TRUE FALSE TRUE FALSE ...
## $ num.vehicles: int 2 3 3 0 1 1 1 3 2 1 ...
## $ age : num 49 40 22 22 31 40 39 48 44 70 ...
## $ state.of.res: Factor w/ 50 levels "Alabama","Alaska",..: 22 9 10 31 9 32 12 22 13 33 ...
# Restrict to the rows where housing.type is NA and look at the other
# columns with missing values
summary(custdata[is.na(custdata$housing.type),
                 c("recent.move", "num.vehicles")])
## recent.move num.vehicles
## Mode:logical Min. : NA
## NA's:56 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :56
# The 56 rows missing housing.type are the same rows that are missing
# recent.move and num.vehicles. These variables are only missing a few
# values, so it's probably safe to just drop the rows with missing values,
# especially since the missing values are all in the same 56 rows
# (a sketch of dropping them follows).
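# A minimal sketch of that row-dropping step (my own addition, not a listing
# from the book); the name custdata.complete is my choice
custdata.complete <- subset(custdata, !is.na(housing.type))
dim(custdata.complete)   # should leave 1000 - 56 = 944 rows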
# But what about is.employed? There you're missing data from about a third
# of the customers. What do you do then?
# MISSING DATA IN CATEGORICAL VARIABLES
# The most straightforward solution is just to create a new category for the variable,
# called missing.
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed),"missing",
ifelse(custdata$is.employed==TRUE,"employed",
"not employed"))
summary(as.factor(custdata$is.employed.fix))
## employed missing not employed
## 599 328 73
# If domain knowledge suggests that the missing values really mean the
# customer is not in the active workforce (rather than simply "unknown"),
# a more informative category name is better:
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed),
                                   "not in active workforce",
                                   ifelse(custdata$is.employed == TRUE,
                                          "employed", "not employed"))
summary(as.factor(custdata$is.employed.fix))
## employed not employed not in active workforce
## 599 73 328
# MISSING VALUES IN NUMERIC DATA
# Suppose your income variable were missing substantial data. One option is
# to convert income into a categorical variable by binning it, with the
# missing values getting their own category:
summary(custdata$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8700 14600 35000 53500 67000 615000
breaks <- c(0,10000,50000,100000,250000,500000,1000000)
income.groups <- cut(custdata$income,breaks=breaks,include.lowest = TRUE)
summary(income.groups)
## [0,1e+04] (1e+04,5e+04] (5e+04,1e+05] (1e+05,2.5e+05]
## 184 469 215 105
## (2.5e+05,5e+05] (5e+05,1e+06] NA's
## 25 1 1
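# The single NA comes from the one customer with a negative income (-8700),
# which falls outside the lowest break of 0. Converting the factor to
# character lets us relabel that NA as its own category: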
income.groups <- as.character(income.groups)
income.groups <- ifelse(is.na(income.groups),"no income",income.groups)
summary(as.factor(income.groups))
## (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,5e+05] (5e+04,1e+05]
## 469 105 25 215
## (5e+05,1e+06] [0,1e+04] no income
## 1 184 1
# Mean income by state of residence
aggregate(custdata$income ~ custdata$state.of.res, FUN = mean, data = custdata)
## custdata$state.of.res custdata$income
## 1 Alabama 23018.18
## 2 Alaska 54690.00
## 3 Arizona 49455.56
## 4 Arkansas 74957.14
## 5 California 53263.90
## 6 Colorado 52100.09
## 7 Connecticut 43035.71
## 8 Delaware 94700.00
## 9 Florida 49231.02
## 10 Georgia 28434.07
## 11 Hawaii 56640.00
## 12 Idaho 53800.00
## 13 Illinois 62920.08
## 14 Indiana 61765.17
## 15 Iowa 40360.00
## 16 Kansas 38475.00
## 17 Kentucky 34332.50
## 18 Louisiana 91756.67
## 19 Maine 117416.67
## 20 Maryland 45844.12
## 21 Massachusetts 46972.50
## 22 Michigan 46511.35
## 23 Minnesota 72731.90
## 24 Mississippi 16714.29
## 25 Missouri 79145.71
## 26 Montana 9500.00
## 27 Nebraska 39825.00
## 28 Nevada 64500.00
## 29 New Hampshire 44010.00
## 30 New Jersey 60146.41
## 31 New Mexico 38466.67
## 32 New York 58183.39
## 33 North Carolina 56550.00
## 34 North Dakota 0.00
## 35 Ohio 55840.98
## 36 Oklahoma 49863.64
## 37 Oregon 48457.14
## 38 Pennsylvania 54437.57
## 39 Rhode Island 59550.00
## 40 South Carolina 58106.00
## 41 South Dakota 66700.00
## 42 Tennessee 45791.82
## 43 Texas 55959.64
## 44 Utah 28440.00
## 45 Vermont 24666.67
## 46 Virginia 56792.07
## 47 Washington 60548.24
## 48 West Virginia 30725.38
## 49 Wisconsin 49926.61
## 50 Wyoming 10000.00
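# One way these per-state summaries might be used (my own sketch, not from
# the original notes): merge the state means back onto custdata so each
# customer's income can be compared to the average for their state. The
# names state.income, mean.state.income, and custdata.merged are my choices.
state.income <- aggregate(income ~ state.of.res, data = custdata, FUN = mean)
names(state.income)[2] <- "mean.state.income"
custdata.merged <- merge(custdata, state.income, by = "state.of.res")
head(custdata.merged$income / custdata.merged$mean.state.income)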
# Count the customers with health insurance
sum(custdata$health.ins == TRUE)
## [1] 841
summary(custdata$health.ins)
## Mode FALSE TRUE NA's
## logical 159 841 0
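# Because TRUE counts as 1 and FALSE as 0, the mean of the logical vector
# gives the fraction of customers with health insurance
mean(custdata$health.ins)   # 841 / 1000 = 0.841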
# objects(grep("faraway", search()))  # lists the objects in the faraway package once it is attached
# The fruitfly data set from the faraway package records the longevity of
# male fruitflies kept under different sexual-activity regimes
library(faraway)
data("fruitfly")
plot(fruitfly$longevity ~ fruitfly$activity)   # boxplots: activity is a factor
aggregate(fruitfly$longevity ~ fruitfly$activity, FUN = mean, data = fruitfly)
## fruitfly$activity fruitfly$longevity
## 1 isolated 63.56000
## 2 one 64.80000
## 3 low 56.76000
## 4 many 64.54167
## 5 high 38.72000
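# The same group means can also be computed with tapply, which returns a
# named vector instead of a data frame (shown only as an alternative)
tapply(fruitfly$longevity, fruitfly$activity, mean)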
# K Means Clustering in R
# by Teja Kodali
library(ggplot2)

ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point()

head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Cluster on the petal measurements only (columns 3 and 4); nstart = 20 runs
# k-means from 20 random starting assignments and keeps the best solution
set.seed(20)
irisCluster <- kmeans(iris[, 3:4], centers = 3, nstart = 20)
irisCluster
## K-means clustering with 3 clusters of sizes 50, 52, 48
##
## Cluster means:
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3
## [141] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.02200 13.05769 16.29167
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Compare the cluster assignments with the true species labels
table(irisCluster$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
irisCluster$cluster <- as.factor(irisCluster$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) +
  geom_point()
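# A quick check on the choice k = 3 (my own addition, not part of the
# original notes): compute the total within-cluster sum of squares for
# k = 1..10 and look for the "elbow" where adding clusters stops helping
set.seed(20)
wss <- sapply(1:10, function(k) {
  kmeans(iris[, 3:4], centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")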
