Data Manipulation & Cleaning

Shige

Preparation: Loading the packages

library(reshape2)
library(plyr)
library(ggplot2)
library(Zelig)
library(memisc)

We will be using the “turnout” data set that comes with the Zelig package.

data(turnout)

Tasks

  • Describing data;
  • Renaming variables;
  • Recoding variables;
  • Generating new variables;
  • Subsetting data;
  • Merging data;
  • Reshaping data;
  • Labeling variables and values.

Summary: I

summary(turnout)
     race           age          educate         income     
 others: 292   Min.   :17.0   Min.   : 0.0   Min.   : 0.00  
 white :1708   1st Qu.:31.0   1st Qu.:10.0   1st Qu.: 1.74  
               Median :42.0   Median :12.0   Median : 3.35  
               Mean   :45.3   Mean   :12.1   Mean   : 3.89  
               3rd Qu.:59.0   3rd Qu.:14.0   3rd Qu.: 5.23  
               Max.   :95.0   Max.   :19.0   Max.   :14.93  
      vote      
 Min.   :0.000  
 1st Qu.:0.000  
 Median :1.000  
 Mean   :0.746  
 3rd Qu.:1.000  
 Max.   :1.000  

Summary: II

summary(turnout$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   17.0    31.0    42.0    45.3    59.0    95.0 

or

# Single-bracket indexing by position keeps column 2 (age) as a one-column
# data frame, so summary() prints the data-frame layout shown below.
summary(turnout[2])
      age      
 Min.   :17.0  
 1st Qu.:31.0  
 Median :42.0  
 Mean   :45.3  
 3rd Qu.:59.0  
 Max.   :95.0  

Summary: III

# with() evaluates summary(age) inside the data frame, so nothing is attached
# to the search path and no matching detach() is needed afterwards.
with(turnout, summary(age))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   17.0    31.0    42.0    45.3    59.0    95.0 

Rename

# Old names on the left, new names on the right. Call plyr's rename()
# explicitly: memisc is attached after plyr and also exports rename().
change <- c(age = "Age", race = "Race", educate = "Educate", income = "Income")
turnout <- plyr::rename(turnout, change)
names(turnout)
[1] "Race"    "Age"     "Educate" "Income"  "vote"   

Alternatively, use rename() from the “memisc” package, which takes old = "new" pairs:

# memisc::rename() takes old_name = "new_name" pairs; the namespace prefix
# pins the call to memisc no matter which other package also exports rename().
turnout <- memisc::rename(turnout,
                          Age = "var_age",
                          Race = "var_race")
names(turnout)
[1] "var_race" "var_age"  "Educate"  "Income"   "vote"    

Creating new variables

# Copy Educate into a new column so the recoding below leaves the original
# variable untouched.
turnout$school <- turnout$Educate
names(turnout)
[1] "var_race" "var_age"  "Educate"  "Income"   "vote"     "school"  
table(turnout$school)

  0 0.5   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
  9   3   3   1  16  17  26  35  50 157  71 123  87 685 106 218  43 204 
 17  19 
 47  99 

Recoding

library(memisc)
table(turnout$school)

  0 0.5   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
  9   3   3   1  16  17  26  35  50 157  71 123  87 685 106 218  43 204 
 17  19 
 47  99 
# New codes go on the left of `<-`, old values on the right. The first rule
# lists 0 and 0.5 explicitly: 0:0.5 evaluates to just 0, which would leave the
# respondents coded 0.5 unmatched and missing from the table.
turnout$school <- recode(turnout$Educate,
                         0 <- c(0, 0.5),
                         1 <- 1:9,
                         2 <- 10:14,
                         3 <- 15:19)
table(turnout$school)

   0    1    2    3 
  12  376 1219  393 

Subsetting: I

# select= keeps only the listed columns; every row is retained.
new_turnout <- subset(turnout, select=c(school, vote))
names(new_turnout)
[1] "school" "vote"  

Subsetting: II

hist(turnout$var_age)

plot of chunk unnamed-chunk-14

# Keep only the rows where var_age is below 40.
young_people <- subset(turnout, var_age<40)
hist(young_people$var_age)

plot of chunk unnamed-chunk-15

Labeling: variable label

library(Hmisc)
# Attach a descriptive variable label to each column (label<- is presumably
# the Hmisc replacement function loaded just above).
label(turnout$var_race) <- "Race and ethnicity"
label(turnout$var_age) <- "Age of respondent"

Labeling: value label

table(turnout$school)

   0    1    2    3 
   9  376 1219  393 
# Convert the codes into a factor; levels and labels are matched by
# position, so 0 -> "No schooling", 1 -> "Elementary school", and so on.
turnout$school <- factor(
  turnout$school,
  levels=c(0, 1, 2, 3),
  labels=c("No schooling", "Elementary school", "High school", "College")
  )
table(turnout$school)

     No schooling Elementary school       High school           College 
                9               376              1219               393 

Merging

** TO BE ADDED **

Reshaping

** TO BE ADDED **