Data Munging

Cleaning a Dataframe

# URL of the CSV file to download; the path points at the Census Bureau's
# 2010-2011 state population totals table (nst-est2011-01).
fdir <- "http://www2.census.gov/programs-surveys/popest/tables/2010-2011/state/totals/nst-est2011-01.csv"

# Read the CSV file located at the URL `fdir`, skipping the first three lines
# of the file, and return the parsed contents as a data frame.
readCensus <- function(fdir) {
  read.csv(url(fdir), skip = 3)
}

df <- readCensus(fdir)
head(df)

##               X      Census Estimates.Base       X2010       X2011 X.1 X.2
## 1 United States 308,745,538    308,745,538 309,330,219 311,591,917  NA  NA
## 2     Northeast  55,317,240     55,317,244  55,366,108  55,521,598  NA  NA
## 3       Midwest  66,927,001     66,926,987  66,976,458  67,158,835  NA  NA
## 4         South 114,555,744    114,555,757 114,857,529 116,046,736  NA  NA
## 5          West  71,945,553     71,945,550  72,130,124  72,864,748  NA  NA
## 6      .Alabama   4,779,736      4,779,735   4,785,401   4,802,740  NA  NA
##   X.3 X.4 X.5
## 1  NA  NA  NA
## 2  NA  NA  NA
## 3  NA  NA  NA
## 4  NA  NA  NA
## 5  NA  NA  NA
## 6  NA  NA  NA

Check the column names of the data frame:

colnames(df)

##  [1] "X"              "Census"         "Estimates.Base" "X2010"         
##  [5] "X2011"          "X.1"            "X.2"            "X.3"           
##  [9] "X.4"            "X.5"

We only need columns 1 through 5; the remaining columns (X.1 to X.5) show only NA in the rows above. Keep just the required columns:

# Keep only the first five columns, then preview the result.
df <- df[, 1:5]
head(df)

##               X      Census Estimates.Base       X2010       X2011
## 1 United States 308,745,538    308,745,538 309,330,219 311,591,917
## 2     Northeast  55,317,240     55,317,244  55,366,108  55,521,598
## 3       Midwest  66,927,001     66,926,987  66,976,458  67,158,835
## 4         South 114,555,744    114,555,757 114,857,529 116,046,736
## 5          West  71,945,553     71,945,550  72,130,124  72,864,748
## 6      .Alabama   4,779,736      4,779,735   4,785,401   4,802,740

Inspect the tail of the data frame:

tail(df)

##                                                                                                                                                                                                                                                                                                                                                                     X
## 58                                                                                                                                                                                                                                                                                                                                                        Puerto Rico
## 59 Note: The April 1, 2010 Population Estimates base reflects changes to the Census 2010 population from geographic program revisions.  It does not reflect changes from the Count Question Resolution program.  See Geographic Terms and Definitions at http://www.census.gov/popest/about/geo/terms.html for a list of the states that are included in each region.
## 60                                                                                                                                                                                                                                                                                                                                                Suggested Citation:
## 61                                                                                                                                                                                                                Table 1. Annual Estimates of the Population for the United States, Regions, States, and Puerto Rico: April 1, 2010 to July 1, 2011 (NST-EST2011-01)
## 62                                                                                                                                                                                                                                                                                                                    Source: U.S. Census Bureau, Population Division
## 63                                                                                                                                                                                                                                                                                                                                        Release Date: December 2011
##       Census Estimates.Base     X2010     X2011
## 58 3,725,789      3,725,789 3,721,978 3,706,690
## 59                                             
## 60                                             
## 61                                             
## 62                                             
## 63

# Keep rows 6 through 56 only. In the output above, rows 1-5 are the United
# States and regional totals, and rows 58-63 are Puerto Rico and the file's
# footnotes, so this slice drops the non-state rows at both ends.
dfState <- df[6:56,]
head(dfState)

##              X     Census Estimates.Base      X2010      X2011
## 6     .Alabama  4,779,736      4,779,735  4,785,401  4,802,740
## 7      .Alaska    710,231        710,231    714,146    722,718
## 8     .Arizona  6,392,017      6,392,013  6,413,158  6,482,505
## 9    .Arkansas  2,915,918      2,915,921  2,921,588  2,937,979
## 10 .California 37,253,956     37,253,956 37,338,198 37,691,912
## 11   .Colorado  5,029,196      5,029,196  5,047,692  5,116,796

Rename the columns:

# Give the five columns descriptive names, then summarise the data frame.
names(dfState) <- c("stateName", "base2010", "base2011", "Jul2010", "Jul2011")
summary(dfState)

##        stateName       base2010       base2011       Jul2010  
##  .Alabama   : 1   1,052,567: 1   1,052,567: 1   1,052,528: 1  
##  .Alaska    : 1   1,316,470: 1   1,316,472: 1   1,316,807: 1  
##  .Arizona   : 1   1,328,361: 1   1,328,361: 1   1,327,379: 1  
##  .Arkansas  : 1   1,360,301: 1   1,360,301: 1   1,363,359: 1  
##  .California: 1   1,567,582: 1   1,567,582: 1   1,571,102: 1  
##  .Colorado  : 1   1,826,341: 1   1,826,341: 1   1,830,141: 1  
##  (Other)    :45   (Other)  :45   (Other)  :45   (Other)  :45  
##       Jul2011  
##  1,051,302: 1  
##  1,318,194: 1  
##  1,328,188: 1  
##  1,374,810: 1  
##  1,584,985: 1  
##  1,842,641: 1  
##  (Other)  :45

Remove the special characters (the thousands-separator commas and the leading dot in the state names) and convert each column to the appropriate data type:

# Strip the "," thousands separators from the four population columns and
# convert them to integers; remove the single leading "." from state names.
dfState <- transform(
  dfState,
  base2010 = as.integer(gsub(",", "", base2010)),
  base2011 = as.integer(gsub(",", "", base2011)),
  Jul2010 = as.integer(gsub(",", "", Jul2010)),
  Jul2011 = as.integer(gsub(",", "", Jul2011)),
  stateName = gsub("^[.]{1}", "", stateName)
)
summary(dfState)

##   stateName            base2010           base2011       
##  Length:51          Min.   :  563626   Min.   :  563626  
##  Class :character   1st Qu.: 1696962   1st Qu.: 1696962  
##  Mode  :character   Median : 4339367   Median : 4339362  
##                     Mean   : 6053834   Mean   : 6053834  
##                     3rd Qu.: 6636084   3rd Qu.: 6636084  
##                     Max.   :37253956   Max.   :37253956  
##     Jul2010            Jul2011        
##  Min.   :  564554   Min.   :  568158  
##  1st Qu.: 1700622   1st Qu.: 1713813  
##  Median : 4347223   Median : 4369356  
##  Mean   : 6065298   Mean   : 6109645  
##  3rd Qu.: 6649208   3rd Qu.: 6708787  
##  Max.   :37338198   Max.   :37691912

head(dfState)

##     stateName base2010 base2011  Jul2010  Jul2011
## 6     Alabama  4779736  4779735  4785401  4802740
## 7      Alaska   710231   710231   714146   722718
## 8     Arizona  6392017  6392013  6413158  6482505
## 9    Arkansas  2915918  2915921  2921588  2937979
## 10 California 37253956 37253956 37338198 37691912
## 11   Colorado  5029196  5029196  5047692  5116796

Find the proportion of states whose 2011 population is below the mean 2011 state population:

# Return the proportion of elements of `x` that are strictly less than `y`.
# `x` is a vector and `y` is a number. Elements whose comparison yields NA are
# not counted as below `y`, but they still count toward the denominator.
distfunc <- function(x, y) {
  sum(x < y, na.rm = TRUE) / length(x)
}


distfunc(dfState$Jul2011, mean(dfState$Jul2011))

## [1] 0.6666667

About 67% of the states have a 2011 population below the mean 2011 state population.

Put all the cleaning steps into a single function:

# Download the Census population-estimates CSV at `url` and return a cleaned
# data frame of its state-level rows.
#
# Arguments:
#   url - character string giving the location of the CSV file.
#
# Returns a data frame with the columns stateName (character, with the single
# leading "." removed) and base2010, base2011, Jul2010, Jul2011 (integer, with
# the "," thousands separators removed), taken from rows 6 to 56 and the first
# five columns of the parsed file.
readCensus <- function(url) {
  # Read from the `url` argument instead of the global `fdir`, so the function
  # honours its parameter. `base::url` is spelled out because the parameter
  # shares its name with the connection function.
  df <- read.csv(base::url(url), skip = 3)

  # Keep the first five columns and rows 6 to 56 (the state rows in the file
  # this was written for; earlier rows hold national and regional totals).
  df <- df[6:56, 1:5]
  colnames(df) <- c("stateName", "base2010", "base2011", "Jul2010", "Jul2011")

  # Drop the thousands separators and convert the counts to integers.
  popCols <- c("base2010", "base2011", "Jul2010", "Jul2011")
  df[popCols] <- lapply(
    df[popCols],
    function(v) as.integer(gsub(",", "", v))
  )

  # Remove the single leading "." from each state name.
  df$stateName <- gsub("^[.]{1}", "", df$stateName)

  df
}