# Practical Data Science with R, by Nina Zumel and John Mount
# Chapter 4: Managing data
# This chapter covers:
# - Fixing data quality problems
# - Organizing your data for the modeling process
setwd("C:\\Users\\Luis\\Documents\\practical_data_science\\zmPDSwR-master\\zmPDSwR-master\\Custdata")
# The working directory already points at the Custdata folder, so the file
# can be read with a relative path
custdata <- read.table("custdata.tsv", header = TRUE, sep = "\t")
names(custdata)
## [1] "custid" "sex" "is.employed" "income"
## [5] "marital.stat" "health.ins" "housing.type" "recent.move"
## [9] "num.vehicles" "age" "state.of.res"
summary(custdata)
## custid sex is.employed income
## Min. : 2068 F:440 Mode :logical Min. : -8700
## 1st Qu.: 345667 M:560 FALSE:73 1st Qu.: 14600
## Median : 693403 TRUE :599 Median : 35000
## Mean : 698500 NA's :328 Mean : 53505
## 3rd Qu.:1044606 3rd Qu.: 67000
## Max. :1414286 Max. :615000
##
## marital.stat health.ins
## Divorced/Separated:155 Mode :logical
## Married :516 FALSE:159
## Never Married :233 TRUE :841
## Widowed : 96 NA's :0
##
##
##
## housing.type recent.move num.vehicles
## Homeowner free and clear :157 Mode :logical Min. :0.000
## Homeowner with mortgage/loan:412 FALSE:820 1st Qu.:1.000
## Occupied with no rent : 11 TRUE :124 Median :2.000
## Rented :364 NA's :56 Mean :1.916
## NA's : 56 3rd Qu.:2.000
## Max. :6.000
## NA's :56
## age state.of.res
## Min. : 0.0 California :100
## 1st Qu.: 38.0 New York : 71
## Median : 50.0 Pennsylvania: 70
## Mean : 51.7 Texas : 56
## 3rd Qu.: 64.0 Michigan : 52
## Max. :146.7 Ohio : 51
## (Other) :600
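# The summary already flags several data quality problems: income has a
# negative minimum (-8700), age runs from 0 to 146.7 (values that look like
# sentinels or errors), and is.employed, housing.type, recent.move, and
# num.vehicles all contain missing values.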
str(custdata)
## 'data.frame': 1000 obs. of 11 variables:
## $ custid : int 2068 2073 2848 5641 6369 8322 8521 12195 14989 15917 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 2 2 1 1 2 2 2 1 ...
## $ is.employed : logi NA NA TRUE TRUE TRUE TRUE ...
## $ income : int 11300 0 4500 20000 12000 180000 120000 40000 9400 24000 ...
## $ marital.stat: Factor w/ 4 levels "Divorced/Separated",..: 2 2 3 3 3 3 3 2 2 1 ...
## $ health.ins : logi TRUE TRUE FALSE FALSE TRUE TRUE ...
## $ housing.type: Factor w/ 4 levels "Homeowner free and clear",..: 1 4 4 3 4 2 1 4 4 1 ...
## $ recent.move : logi FALSE TRUE TRUE FALSE TRUE FALSE ...
## $ num.vehicles: int 2 3 3 0 1 1 1 3 2 1 ...
## $ age : num 49 40 22 22 31 40 39 48 44 70 ...
## $ state.of.res: Factor w/ 50 levels "Alabama","Alaska",..: 22 9 10 31 9 32 12 22 13 33 ...
# Restrict to the rows where housing.type is NA and look at the other
# columns with missing values
summary(custdata[is.na(custdata$housing.type),
                 c("recent.move", "num.vehicles")])
## recent.move num.vehicles
## Mode:logical Min. : NA
## NA's:56 1st Qu.: NA
## Median : NA
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :56
# The 56 rows missing housing.type are the same rows that are missing
# recent.move and num.vehicles. These variables are only missing a few
# values, so it's probably safe to just drop the rows with missing values,
# especially since the missing values are all in the same 56 rows
# (a sketch of dropping them follows).
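# A minimal sketch of that row-dropping step (my own addition, not a listing
# from the book); the name custdata.complete is my choice
custdata.complete <- subset(custdata, !is.na(housing.type))
dim(custdata.complete)   # should leave 1000 - 56 = 944 rows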
# But what about is.employed? There you're missing data from about a third
# of the customers. What do you do then?
# MISSING DATA IN CATEGORICAL VARIABLES
# The most straightforward solution is just to create a new category for the variable,
# called missing.
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed),"missing",
ifelse(custdata$is.employed==TRUE,"employed",
"not employed"))
summary(as.factor(custdata$is.employed.fix))
## employed missing not employed
## 599 328 73
# If domain knowledge suggests that the missing values really mean the
# customer is not in the active workforce (rather than simply "unknown"),
# a more informative category name is better:
custdata$is.employed.fix <- ifelse(is.na(custdata$is.employed),
                                   "not in active workforce",
                                   ifelse(custdata$is.employed == TRUE,
                                          "employed", "not employed"))
summary(as.factor(custdata$is.employed.fix))
## employed not employed not in active workforce
## 599 73 328
# MISSING VALUES IN NUMERIC DATA
# Suppose your income variable were missing substantial data. One option is
# to convert income into a categorical variable by binning it, with the
# missing values getting their own category:
summary(custdata$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8700 14600 35000 53500 67000 615000
breaks <- c(0,10000,50000,100000,250000,500000,1000000)
income.groups <- cut(custdata$income,breaks=breaks,include.lowest = TRUE)
summary(income.groups)
## [0,1e+04] (1e+04,5e+04] (5e+04,1e+05] (1e+05,2.5e+05]
## 184 469 215 105
## (2.5e+05,5e+05] (5e+05,1e+06] NA's
## 25 1 1
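# The single NA comes from the one customer with a negative income (-8700),
# which falls outside the lowest break of 0. Converting the factor to
# character lets us relabel that NA as its own category: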
income.groups <- as.character(income.groups)
income.groups <- ifelse(is.na(income.groups),"no income",income.groups)
summary(as.factor(income.groups))
## (1e+04,5e+04] (1e+05,2.5e+05] (2.5e+05,5e+05] (5e+04,1e+05]
## 469 105 25 215
## (5e+05,1e+06] [0,1e+04] no income
## 1 184 1
# Mean income by state of residence
aggregate(custdata$income ~ custdata$state.of.res, FUN = mean, data = custdata)
## custdata$state.of.res custdata$income
## 1 Alabama 23018.18
## 2 Alaska 54690.00
## 3 Arizona 49455.56
## 4 Arkansas 74957.14
## 5 California 53263.90
## 6 Colorado 52100.09
## 7 Connecticut 43035.71
## 8 Delaware 94700.00
## 9 Florida 49231.02
## 10 Georgia 28434.07
## 11 Hawaii 56640.00
## 12 Idaho 53800.00
## 13 Illinois 62920.08
## 14 Indiana 61765.17
## 15 Iowa 40360.00
## 16 Kansas 38475.00
## 17 Kentucky 34332.50
## 18 Louisiana 91756.67
## 19 Maine 117416.67
## 20 Maryland 45844.12
## 21 Massachusetts 46972.50
## 22 Michigan 46511.35
## 23 Minnesota 72731.90
## 24 Mississippi 16714.29
## 25 Missouri 79145.71
## 26 Montana 9500.00
## 27 Nebraska 39825.00
## 28 Nevada 64500.00
## 29 New Hampshire 44010.00
## 30 New Jersey 60146.41
## 31 New Mexico 38466.67
## 32 New York 58183.39
## 33 North Carolina 56550.00
## 34 North Dakota 0.00
## 35 Ohio 55840.98
## 36 Oklahoma 49863.64
## 37 Oregon 48457.14
## 38 Pennsylvania 54437.57
## 39 Rhode Island 59550.00
## 40 South Carolina 58106.00
## 41 South Dakota 66700.00
## 42 Tennessee 45791.82
## 43 Texas 55959.64
## 44 Utah 28440.00
## 45 Vermont 24666.67
## 46 Virginia 56792.07
## 47 Washington 60548.24
## 48 West Virginia 30725.38
## 49 Wisconsin 49926.61
## 50 Wyoming 10000.00
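# One way these per-state summaries might be used (my own sketch, not from
# the original notes): merge the state means back onto custdata so each
# customer's income can be compared to the average for their state. The
# names state.income, mean.state.income, and custdata.merged are my choices.
state.income <- aggregate(income ~ state.of.res, data = custdata, FUN = mean)
names(state.income)[2] <- "mean.state.income"
custdata.merged <- merge(custdata, state.income, by = "state.of.res")
head(custdata.merged$income / custdata.merged$mean.state.income)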
# Count the customers with health insurance
sum(custdata$health.ins == TRUE)
## [1] 841
summary(custdata$health.ins)
## Mode FALSE TRUE NA's
## logical 159 841 0
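# Because TRUE counts as 1 and FALSE as 0, the mean of the logical vector
# gives the fraction of customers with health insurance
mean(custdata$health.ins)   # 841 / 1000 = 0.841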
# objects(grep("faraway", search()))  # lists the objects in the faraway package once it is attached
# The fruitfly data set from the faraway package records the longevity of
# male fruitflies kept under different sexual-activity regimes
library(faraway)
data("fruitfly")
plot(fruitfly$longevity ~ fruitfly$activity)   # boxplots: activity is a factor
aggregate(fruitfly$longevity ~ fruitfly$activity, FUN = mean, data = fruitfly)
## fruitfly$activity fruitfly$longevity
## 1 isolated 63.56000
## 2 one 64.80000
## 3 low 56.76000
## 4 many 64.54167
## 5 high 38.72000
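# The same group means can also be computed with tapply, which returns a
# named vector instead of a data frame (shown only as an alternative)
tapply(fruitfly$longevity, fruitfly$activity, mean)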
# K Means Clustering in R
# by Teja Kodali
library(ggplot2)

ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point()

head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Cluster on the petal measurements only (columns 3 and 4); nstart = 20 runs
# k-means from 20 random starting assignments and keeps the best solution
set.seed(20)
irisCluster <- kmeans(iris[, 3:4], centers = 3, nstart = 20)
irisCluster
## K-means clustering with 3 clusters of sizes 50, 52, 48
##
## Cluster means:
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3
## [141] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.02200 13.05769 16.29167
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Compare the cluster assignments with the true species labels
table(irisCluster$cluster, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
irisCluster$cluster <- as.factor(irisCluster$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) +
  geom_point()
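# A quick check on the choice k = 3 (my own addition, not part of the
# original notes): compute the total within-cluster sum of squares for
# k = 1..10 and look for the "elbow" where adding clusters stops helping
set.seed(20)
wss <- sapply(1:10, function(k) {
  kmeans(iris[, 3:4], centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")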
