Trying out R for tidying data. This is my attempt to solve the first exercise given at the Data Curation Workshop at the OR2014 preconference.
library('plyr')
# Load the raw data. Cells that are empty or hold a single space become NA.
# stringsAsFactors is set explicitly: its default changed to FALSE in R 4.0.0,
# but the rest of this script treats text columns as factors (level-based
# recoding, dropping unused levels, per-level counts in summary()).
df <- read.csv(
  "./data1/data1.csv",
  na.strings = c(" ", ""),
  stringsAsFactors = TRUE
)
# Descriptive column names, listed in the column order of the raw file
# (46 names). Positions matter: the script drops columns 11, 30 and 45 by
# index after renaming. Every name is unique; the throwaway column at
# position 45 is called "gender.remove" (mirroring "age.breaks.remove") so it
# cannot be confused with the real "gender" column at position 20.
tidy.names <- c(
  # columns 1-10
  "caseid", "sample.type", "timezone", "census.region", "census.division",
  "abc.state", "fips.state", "msa.flag", "csba.type", "metropolitan.division",
  # columns 11-20
  "fips.county", "zip.code", "urbanity", "percent.black", "percent.asian",
  "percent.hispanic", "congressional.district", "rdd.block.count",
  "times.tried", "gender",
  # columns 21-30
  "business.flag", "company.type", "registered.voter", "ideology", "education",
  "associate.degree.type", "education.breaks", "college.education", "age",
  "age.breaks.remove",
  # columns 31-40
  "age.breaks", "religion", "religion.specified", "religion.christian",
  "religion.protestant", "religion.born.again.christian", "religion.nets",
  "white.evangelical.protestant", "veteran.status", "race",
  # columns 41-46
  "race.net", "income.before.taxes", "income.levels.over.100k",
  "reporter.call", "gender.remove", "weight"
)
# Apply the descriptive names. The column drops at the end of this section are
# positional, so fail early if the name vector does not match the data.
stopifnot(ncol(df) == length(tidy.names))
names(df) <- tidy.names

# csba.type: C = Micropolitan, M = Metropolitan, 9 = missing.
# Mapping "9" to NA is sufficient; no separate NA pre-assignment is needed.
df$csba.type <- mapvalues(
  df$csba.type,
  from = c("C", "M", "9"),
  to = c("Micropolitan", "Metropolitan", NA)
)

# urbanity: R = Rural, S = Suburban, U = Urban, 9 = missing.
urbanity.levels <- c("Rural", "Suburban", "Urban")
df$urbanity <- mapvalues(
  df$urbanity,
  from = c("R", "S", "U", "9"),
  to = c(urbanity.levels, NA)
)

# gender: anything other than "Female"/"Male" becomes missing.
# %in% never yields NA, and assigning into the column vector (instead of into
# a row subset of the data frame) also works when no row needs blanking.
df$gender[!(df$gender %in% c("Female", "Male"))] <- NA
# factor() drops the now-unused levels and also accepts a character column.
df$gender <- factor(df$gender)

# business.flag holds category codes, not quantities.
df$business.flag <- as.factor(df$business.flag)

# weight: go through character so a factor yields its labels, not level codes.
df$weight <- as.numeric(as.character(df$weight))

# Remove the unwanted columns in one step. Positions refer to the 46-column
# layout given by tidy.names:
#   45 - second, erroneous gender field
#   30 - erroneous age field (named "age.breaks.remove")
#   11 - "fips.county", which has only NA values
df <- df[, -c(11, 30, 45)]
Let's see if the result looks reasonable:
# Print a per-column summary of the cleaned data frame as a sanity check.
summary(df)
## caseid sample.type timezone census.region
## Min. : 1 oversample: 97 Central :179 Midwest (CDiv 3,4) :144
## 1st Qu.: 280 rdd :525 Eastern :332 Northeast (CDiv 1,2):112
## Median : 566 Mountain: 27 South (CDiv 5,6,7) :257
## Mean : 558 Pacific : 84 West (CDiv 8,9) :109
## 3rd Qu.: 840
## Max. :1101
##
## census.division abc.state fips.state
## South Atlantic :142 California : 51 California : 50
## East North Central: 96 Texas : 43 Texas : 42
## Middle Atlantic : 87 Pennsylvania: 39 Pennsylvania: 33
## Pacific : 75 Florida : 36 Florida : 32
## West South Central: 70 New York : 30 New York : 27
## West North Central: 48 Ohio : 27 (Other) :359
## (Other) :104 (Other) :396 NA's : 79
## msa.flag csba.type
## MSA :441 Micropolitan: 60
## Non-MSA:102 Metropolitan:441
## NA's : 79 NA's :121
##
##
##
##
## metropolitan.division zip.code
## (COUNTIES NOT IN METROPOLITAN DIVISION):407 Mode:logical
## NEW YORK-WAYNE-WHITE PLAINS, NY : 12 NA's:622
## CHICAGO-NAPERVILLE-JOLIET, IL : 11
## DALLAS-PLANO-IRVING, TX : 11
## LOS ANGELES-LONG BEACH-GLENDALE, CA : 8
## (Other) : 94
## NA's : 79
## urbanity percent.black percent.asian percent.hispanic
## Rural :103 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Suburban:290 1st Qu.: 1.00 1st Qu.: 0.00 1st Qu.: 1.00
## Urban :150 Median : 3.00 Median : 1.00 Median : 2.00
## NA's : 79 Mean : 9.12 Mean : 2.25 Mean : 6.27
## 3rd Qu.:10.00 3rd Qu.: 3.00 3rd Qu.: 6.00
## Max. :98.00 Max. :40.00 Max. :111.00
## NA's :79 NA's :79 NA's :79
## congressional.district rdd.block.count times.tried gender
## Min. : 1.00 Min. : 2.0 Min. :1.00 Female:182
## 1st Qu.: 3.00 1st Qu.:21.0 1st Qu.:1.00 Male :432
## Median : 6.00 Median :32.0 Median :1.00 NA's : 8
## Mean : 8.76 Mean :31.4 Mean :1.97
## 3rd Qu.:11.00 3rd Qu.:41.0 3rd Qu.:2.00
## Max. :52.00 Max. :70.0 Max. :8.00
## NA's :79 NA's :79
## business.flag company.type registered.voter
## 0:231 long :539 No : 83
## 1:369 short: 83 Yes:539
## 2: 22
##
##
##
##
## ideology education
## (VOL) Don't think in those terms: 2 8th grade or less : 8
## Conservative :161 Graduated college :175
## Liberal :123 Graduated high school:152
## Moderate :243 Post-graduate : 68
## NA's : 93 Some college :183
## Some high school : 27
## NA's : 9
## associate.degree.type education.breaks
## Bachelor's degree:154 Graduated high school:152
## Other : 21 Less than high school: 35
## NA's :447 Some college + :426
## NA's : 9
##
##
##
## college.education age age.breaks
## College degree :243 Min. :18.0 18-29:104
## No college degree:370 1st Qu.:39.0 30-39: 55
## NA's : 9 Median :52.0 40-49:104
## Mean :51.1 50-64:206
## 3rd Qu.:63.0 65+ :137
## Max. :90.0 NA's : 16
## NA's :16
## religion religion.specified
## Catholic/Roman Catholic : 95 Non-Denominational : 16
## Baptist : 71 Other : 10
## Christian (Non-Protestant): 63 Orthodox (Any Mention): 3
## none : 63 Unitarian/Universalist: 3
## Protestant : 62 Church of Christ : 2
## (Other) :173 (Other) : 6
## NA's : 95 NA's :582
## religion.christian religion.protestant religion.born.again.christian
## No : 1 No : 56 No :230
## Yes : 15 Yes : 54 Yes :198
## NA's:606 NA's:512 NA's:194
##
##
##
##
## religion.nets
## Catholic : 95
## Christian (Non-Protestant): 70
## None : 64
## Other Non-Christian : 35
## Protestant :263
## NA's : 95
##
## white.evangelical.protestant
## White Evangelical Protestant:101
## NA's :521
##
##
##
##
##
## veteran.status
## No veterans in household :392
## Yes, both respondent and other household member: 5
## Yes, other household member is veteran :122
## Yes, respondent is veteran : 16
## NA's : 87
##
##
## race race.net
## White :455 Asian : 8
## Black : 98 Black : 98
## Other Race : 29 Hispanic NET: 22
## White Hispanic: 19 Other : 29
## Asian : 8 White :455
## (Other) : 3 NA's : 10
## NA's : 10
## income.before.taxes income.levels.over.100k
## 100 thousand or more :108 100 to under 150 thousand: 54
## 20 to under 35 thousand : 75 150 to under 200 thousand: 28
## 35 to under 50 thousand : 87 200 to under 250 thousand: 9
## 50 to under 75 thousand :100 250 thousand or more : 14
## 75 to under 100 thousand : 65 NA's :517
## Under 20 thousand dollars: 53
## NA's :134
## reporter.call weight
## No : 58 Min. :0.392
## Yes : 80 1st Qu.:0.566
## NA's:484 Median :0.869
## Mean :0.914
## 3rd Qu.:1.028
## Max. :4.695
##