Open Repositories 2014: Practical Data Curation and Management for Institutional Repositories workshop

Trying out R for tidying data. This is my attempt to solve the first exercise given at the Data Curation Workshop at the OR2014 preconference.

library('plyr')
# Read the exercise data. Cells that are empty or hold a single space are
# read as NA.
# stringsAsFactors is set explicitly: R >= 4.0 reads text columns as character
# by default, while later steps (droplevels() on gender, the per-level counts
# printed by summary()) expect factors.
df <- read.csv('./data1/data1.csv',
               na.strings = c(' ', ''),
               stringsAsFactors = TRUE)

# Replacement column names, listed in column order (46 in total).
# Note that 'gender' occurs twice: at position 20 and at position 45.
tidy.names <- c(
  'caseid', 'sample.type', 'timezone',
  'census.region', 'census.division',
  'abc.state', 'fips.state',
  'msa.flag', 'csba.type', 'metropolitan.division',
  'fips.county', 'zip.code', 'urbanity',
  'percent.black', 'percent.asian', 'percent.hispanic',
  'congressional.district', 'rdd.block.count', 'times.tried',
  'gender', 'business.flag', 'company.type', 'registered.voter',
  'ideology',
  'education', 'associate.degree.type', 'education.breaks',
  'college.education',
  'age', 'age.breaks.remove', 'age.breaks',
  'religion', 'religion.specified', 'religion.christian',
  'religion.protestant', 'religion.born.again.christian',
  'religion.nets', 'white.evangelical.protestant',
  'veteran.status', 'race', 'race.net',
  'income.before.taxes', 'income.levels.over.100k',
  'reporter.call', 'gender', 'weight'
)

# Apply the tidy names. Check the column count first: with too few names the
# trailing columns would silently end up named NA, and with too many the
# assignment fails.
stopifnot(ncol(df) == length(tidy.names))
names(df) <- tidy.names

# csba.type uses the codes C (Micropolitan) and M (Metropolitan); the code 9
# marks a missing value and becomes NA.
df$csba.type[df$csba.type %in% '9'] <- NA
df$csba.type <- mapvalues(
  df$csba.type,
  from = c('C', 'M', '9'),
  to = c('Micropolitan', 'Metropolitan', NA)
)

# urbanity uses the codes R (Rural), S (Suburban) and U (Urban); the code 9
# marks a missing value here as well.
urbanity.levels <- c('Rural', 'Suburban', 'Urban')

df$urbanity[df$urbanity %in% '9'] <- NA
df$urbanity <- mapvalues(
  df$urbanity,
  from = c('R', 'S', 'U', '9'),
  to = c(urbanity.levels, NA)
)

# Gender: any value other than 'Female' or 'Male' is treated as missing.
# The column vector is indexed with %in% instead of assigning through a row
# subset of the data frame. The row-subset form stops with an error when
# gender already contains NA (NA in the row subscript) and when no value is
# invalid (zero-row replacement).
df$gender[!(df$gender %in% c('Female', 'Male'))] <- NA

# Drop the levels that are no longer used. as.factor() keeps this step working
# if the column was read as character instead of factor.
df$gender <- droplevels(as.factor(df$gender))

# Business flag: store the flag values as a factor.
df[['business.flag']] <- as.factor(df[['business.flag']])

# Weight: convert to a real numeric column. Going through character first
# means that, if weight was read as a factor, its labels are converted
# instead of its internal integer codes.
df[['weight']] <- as.numeric(as.character(df[['weight']]))

# Let's remove the erroneous(?) fields.
# Columns are dropped by position because 'gender' occurs twice in the column
# names, so selecting by name would be ambiguous. All positions refer to the
# column order before any removal and are dropped in a single step.
# NOTE(review): zip.code also shows only NA values in the summary output;
# confirm whether it should be dropped as well.
drop.cols <- c(
  fips.county = 11,        # has only NA values
  age.breaks.remove = 30,  # erroneous age field
  gender = 45              # erroneous (second) gender field
)

# Stop if the columns at these positions are not the ones we mean to drop.
stopifnot(identical(names(df)[drop.cols], names(drop.cols)))
df <- df[, -drop.cols]

Let's see if the result looks reasonable

# Print a per-column summary of the tidied data frame.
summary(df)
##      caseid         sample.type      timezone                census.region
##  Min.   :   1   oversample: 97   Central :179   Midwest (CDiv 3,4)  :144  
##  1st Qu.: 280   rdd       :525   Eastern :332   Northeast (CDiv 1,2):112  
##  Median : 566                    Mountain: 27   South (CDiv 5,6,7)  :257  
##  Mean   : 558                    Pacific : 84   West (CDiv 8,9)     :109  
##  3rd Qu.: 840                                                             
##  Max.   :1101                                                             
##                                                                           
##            census.division        abc.state          fips.state 
##  South Atlantic    :142    California  : 51   California  : 50  
##  East North Central: 96    Texas       : 43   Texas       : 42  
##  Middle Atlantic   : 87    Pennsylvania: 39   Pennsylvania: 33  
##  Pacific           : 75    Florida     : 36   Florida     : 32  
##  West South Central: 70    New York    : 30   New York    : 27  
##  West North Central: 48    Ohio        : 27   (Other)     :359  
##  (Other)           :104    (Other)     :396   NA's        : 79  
##     msa.flag          csba.type  
##  MSA    :441   Micropolitan: 60  
##  Non-MSA:102   Metropolitan:441  
##  NA's   : 79   NA's        :121  
##                                  
##                                  
##                                  
##                                  
##                              metropolitan.division zip.code      
##  (COUNTIES NOT IN METROPOLITAN DIVISION):407       Mode:logical  
##  NEW YORK-WAYNE-WHITE PLAINS, NY        : 12       NA's:622      
##  CHICAGO-NAPERVILLE-JOLIET, IL          : 11                     
##  DALLAS-PLANO-IRVING, TX                : 11                     
##  LOS ANGELES-LONG BEACH-GLENDALE, CA    :  8                     
##  (Other)                                : 94                     
##  NA's                                   : 79                     
##      urbanity   percent.black   percent.asian   percent.hispanic
##  Rural   :103   Min.   : 0.00   Min.   : 0.00   Min.   :  0.00  
##  Suburban:290   1st Qu.: 1.00   1st Qu.: 0.00   1st Qu.:  1.00  
##  Urban   :150   Median : 3.00   Median : 1.00   Median :  2.00  
##  NA's    : 79   Mean   : 9.12   Mean   : 2.25   Mean   :  6.27  
##                 3rd Qu.:10.00   3rd Qu.: 3.00   3rd Qu.:  6.00  
##                 Max.   :98.00   Max.   :40.00   Max.   :111.00  
##                 NA's   :79      NA's   :79      NA's   :79      
##  congressional.district rdd.block.count  times.tried      gender   
##  Min.   : 1.00          Min.   : 2.0    Min.   :1.00   Female:182  
##  1st Qu.: 3.00          1st Qu.:21.0    1st Qu.:1.00   Male  :432  
##  Median : 6.00          Median :32.0    Median :1.00   NA's  :  8  
##  Mean   : 8.76          Mean   :31.4    Mean   :1.97               
##  3rd Qu.:11.00          3rd Qu.:41.0    3rd Qu.:2.00               
##  Max.   :52.00          Max.   :70.0    Max.   :8.00               
##  NA's   :79             NA's   :79                                 
##  business.flag company.type registered.voter
##  0:231         long :539    No : 83         
##  1:369         short: 83    Yes:539         
##  2: 22                                      
##                                             
##                                             
##                                             
##                                             
##                              ideology                   education  
##  (VOL) Don't think in those terms:  2   8th grade or less    :  8  
##  Conservative                    :161   Graduated college    :175  
##  Liberal                         :123   Graduated high school:152  
##  Moderate                        :243   Post-graduate        : 68  
##  NA's                            : 93   Some college         :183  
##                                         Some high school     : 27  
##                                         NA's                 :  9  
##        associate.degree.type              education.breaks
##  Bachelor's degree:154       Graduated high school:152    
##  Other            : 21       Less than high school: 35    
##  NA's             :447       Some college +       :426    
##                              NA's                 :  9    
##                                                           
##                                                           
##                                                           
##          college.education      age       age.breaks 
##  College degree   :243     Min.   :18.0   18-29:104  
##  No college degree:370     1st Qu.:39.0   30-39: 55  
##  NA's             :  9     Median :52.0   40-49:104  
##                            Mean   :51.1   50-64:206  
##                            3rd Qu.:63.0   65+  :137  
##                            Max.   :90.0   NA's : 16  
##                            NA's   :16                
##                        religion                religion.specified
##  Catholic/Roman Catholic   : 95   Non-Denominational    : 16     
##  Baptist                   : 71   Other                 : 10     
##  Christian (Non-Protestant): 63   Orthodox (Any Mention):  3     
##  none                      : 63   Unitarian/Universalist:  3     
##  Protestant                : 62   Church of Christ      :  2     
##  (Other)                   :173   (Other)               :  6     
##  NA's                      : 95   NA's                  :582     
##  religion.christian religion.protestant religion.born.again.christian
##  No  :  1           No  : 56            No  :230                     
##  Yes : 15           Yes : 54            Yes :198                     
##  NA's:606           NA's:512            NA's:194                     
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                     religion.nets
##  Catholic                  : 95  
##  Christian (Non-Protestant): 70  
##  None                      : 64  
##  Other Non-Christian       : 35  
##  Protestant                :263  
##  NA's                      : 95  
##                                  
##                white.evangelical.protestant
##  White Evangelical Protestant:101          
##  NA's                        :521          
##                                            
##                                            
##                                            
##                                            
##                                            
##                                          veteran.status
##  No veterans in household                       :392   
##  Yes, both respondent and other household member:  5   
##  Yes, other household member is veteran         :122   
##  Yes, respondent is veteran                     : 16   
##  NA's                                           : 87   
##                                                        
##                                                        
##              race             race.net  
##  White         :455   Asian       :  8  
##  Black         : 98   Black       : 98  
##  Other Race    : 29   Hispanic NET: 22  
##  White Hispanic: 19   Other       : 29  
##  Asian         :  8   White       :455  
##  (Other)       :  3   NA's        : 10  
##  NA's          : 10                     
##                 income.before.taxes              income.levels.over.100k
##  100 thousand or more     :108      100 to under 150 thousand: 54       
##  20 to under 35 thousand  : 75      150 to under 200 thousand: 28       
##  35 to under 50 thousand  : 87      200 to under 250 thousand:  9       
##  50 to under 75 thousand  :100      250 thousand or more     : 14       
##  75 to under 100 thousand : 65      NA's                     :517       
##  Under 20 thousand dollars: 53                                          
##  NA's                     :134                                          
##  reporter.call     weight     
##  No  : 58      Min.   :0.392  
##  Yes : 80      1st Qu.:0.566  
##  NA's:484      Median :0.869  
##                Mean   :0.914  
##                3rd Qu.:1.028  
##                Max.   :4.695  
##