# URL of the census CSV file (table NST-EST2011-01, state population totals
# for 2010-2011) that the rest of this script downloads and cleans.
fdir <- "http://www2.census.gov/programs-surveys/popest/tables/2010-2011/state/totals/nst-est2011-01.csv"
# Fetch the CSV found at the URL `fdir` and return it as a data frame.
# The first three lines of the file are skipped before the header row is read.
readCensus <- function(fdir) {
  census_con <- url(fdir)
  read.csv(census_con, skip = 3)
}
# Download the raw table and look at its first rows.
df <- readCensus(fdir)
head(df)
## X Census Estimates.Base X2010 X2011 X.1 X.2
## 1 United States 308,745,538 308,745,538 309,330,219 311,591,917 NA NA
## 2 Northeast 55,317,240 55,317,244 55,366,108 55,521,598 NA NA
## 3 Midwest 66,927,001 66,926,987 66,976,458 67,158,835 NA NA
## 4 South 114,555,744 114,555,757 114,857,529 116,046,736 NA NA
## 5 West 71,945,553 71,945,550 72,130,124 72,864,748 NA NA
## 6 .Alabama 4,779,736 4,779,735 4,785,401 4,802,740 NA NA
## X.3 X.4 X.5
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
# List the column names; X.1 through X.5 hold only NA in the rows shown above.
colnames(df)
## [1] "X" "Census" "Estimates.Base" "X2010"
## [5] "X2011" "X.1" "X.2" "X.3"
## [9] "X.4" "X.5"
# Keep only the first five columns: the name column and the four population
# columns.
df <- df[,c(1,2,3,4,5)]
head(df)
## X Census Estimates.Base X2010 X2011
## 1 United States 308,745,538 308,745,538 309,330,219 311,591,917
## 2 Northeast 55,317,240 55,317,244 55,366,108 55,521,598
## 3 Midwest 66,927,001 66,926,987 66,976,458 67,158,835
## 4 South 114,555,744 114,555,757 114,857,529 116,046,736
## 5 West 71,945,553 71,945,550 72,130,124 72,864,748
## 6 .Alabama 4,779,736 4,779,735 4,785,401 4,802,740
# Inspect the last rows: everything after Puerto Rico (row 58) is footnote
# text with empty population columns, not data.
tail(df)
## X
## 58 Puerto Rico
## 59 Note: The April 1, 2010 Population Estimates base reflects changes to the Census 2010 population from geographic program revisions. It does not reflect changes from the Count Question Resolution program. See Geographic Terms and Definitions at http://www.census.gov/popest/about/geo/terms.html for a list of the states that are included in each region.
## 60 Suggested Citation:
## 61 Table 1. Annual Estimates of the Population for the United States, Regions, States, and Puerto Rico: April 1, 2010 to July 1, 2011 (NST-EST2011-01)
## 62 Source: U.S. Census Bureau, Population Division
## 63 Release Date: December 2011
## Census Estimates.Base X2010 X2011
## 58 3,725,789 3,725,789 3,721,978 3,706,690
## 59
## 60
## 61
## 62
## 63
# Rows 6 to 56 are the 51 rows that start at .Alabama; rows 1 to 5 are the
# national and regional totals and row 58 onward is Puerto Rico and footnotes.
# NOTE(review): presumably these 51 rows are the 50 states plus DC - confirm.
dfState <- df[6:56,]
head(dfState)
## X Census Estimates.Base X2010 X2011
## 6 .Alabama 4,779,736 4,779,735 4,785,401 4,802,740
## 7 .Alaska 710,231 710,231 714,146 722,718
## 8 .Arizona 6,392,017 6,392,013 6,413,158 6,482,505
## 9 .Arkansas 2,915,918 2,915,921 2,921,588 2,937,979
## 10 .California 37,253,956 37,253,956 37,338,198 37,691,912
## 11 .Colorado 5,029,196 5,029,196 5,047,692 5,116,796
# Replace the auto-generated column names with descriptive ones.
colnames(dfState)<- c('stateName', 'base2010', 'base2011','Jul2010', 'Jul2011')
# The population columns are not numeric yet (values such as 1,052,567 still
# contain commas), so summary() only tabulates the distinct values.
summary(dfState)
## stateName base2010 base2011 Jul2010
## .Alabama : 1 1,052,567: 1 1,052,567: 1 1,052,528: 1
## .Alaska : 1 1,316,470: 1 1,316,472: 1 1,316,807: 1
## .Arizona : 1 1,328,361: 1 1,328,361: 1 1,327,379: 1
## .Arkansas : 1 1,360,301: 1 1,360,301: 1 1,363,359: 1
## .California: 1 1,567,582: 1 1,567,582: 1 1,571,102: 1
## .Colorado : 1 1,826,341: 1 1,826,341: 1 1,830,141: 1
## (Other) :45 (Other) :45 (Other) :45 (Other) :45
## Jul2011
## 1,051,302: 1
## 1,318,194: 1
## 1,328,188: 1
## 1,374,810: 1
## 1,584,985: 1
## 1,842,641: 1
## (Other) :45
# Strip the thousands separators from the four population columns (columns 2
# to 5: base2010, base2011, Jul2010, Jul2011) and convert them to integer.
dfState[2:5] <- lapply(
  dfState[2:5],
  function(popText) as.integer(gsub(",", "", popText))
)
# Drop the leading dot that prefixes each state name.
dfState$stateName <- gsub("^[.]{1}", "", dfState$stateName)
summary(dfState)
## stateName base2010 base2011
## Length:51 Min. : 563626 Min. : 563626
## Class :character 1st Qu.: 1696962 1st Qu.: 1696962
## Mode :character Median : 4339367 Median : 4339362
## Mean : 6053834 Mean : 6053834
## 3rd Qu.: 6636084 3rd Qu.: 6636084
## Max. :37253956 Max. :37253956
## Jul2010 Jul2011
## Min. : 564554 Min. : 568158
## 1st Qu.: 1700622 1st Qu.: 1713813
## Median : 4347223 Median : 4369356
## Mean : 6065298 Mean : 6109645
## 3rd Qu.: 6649208 3rd Qu.: 6708787
## Max. :37338198 Max. :37691912
head(dfState)
## stateName base2010 base2011 Jul2010 Jul2011
## 6 Alabama 4779736 4779735 4785401 4802740
## 7 Alaska 710231 710231 714146 722718
## 8 Arizona 6392017 6392013 6413158 6482505
## 9 Arkansas 2915918 2915921 2921588 2937979
## 10 California 37253956 37253956 37338198 37691912
## 11 Colorado 5029196 5029196 5047692 5116796
# Proportion of the elements of x that are strictly less than y.
# x is a vector and y is a number. Comparisons that yield NA are not counted
# as "below" but still count toward the length of x.
distfunc <- function(x, y) {
  below <- sum(x < y, na.rm = TRUE)
  below / length(x)
}
# Share of the rows in dfState whose Jul2011 value is below the Jul2011 mean.
distfunc(dfState$Jul2011, mean(dfState$Jul2011))
## [1] 0.6666667
# About 67% of the states have a 2011 population below the 2011 mean state population.
# Download the census population table at `url` and return a cleaned data
# frame containing only the state rows.
#
# Args:
#   url: Address of the CSV file, as a character string.
#
# Returns:
#   A data frame holding rows 6 to 56 of the table with the columns
#   stateName (character, leading dot removed) and base2010, base2011,
#   Jul2010, Jul2011 (integer, thousands separators removed).
#
# Raises:
#   An error when the downloaded table has fewer than 5 columns or fewer
#   than 56 rows, since the fixed row/column selection would not apply.
readCensus <- function(url) {
  # Read from the `url` argument rather than from any global variable.
  # base::url() is spelled out because the parameter shares its name with
  # the connection function.
  raw <- read.csv(base::url(url), skip = 3)

  # Fail early instead of silently returning NA-filled rows when the table
  # is smaller than the fixed selection below expects.
  if (ncol(raw) < 5L || nrow(raw) < 56L) {
    stop("census table must have at least 5 columns and 56 rows",
         call. = FALSE)
  }

  # Rows 6 to 56 and the first five columns hold the per-state figures.
  states <- raw[6:56, 1:5]
  colnames(states) <- c("stateName", "base2010", "base2011",
                        "Jul2010", "Jul2011")

  # Population figures arrive as text such as "4,779,736".
  popCols <- c("base2010", "base2011", "Jul2010", "Jul2011")
  states[popCols] <- lapply(
    states[popCols],
    function(popText) as.integer(gsub(",", "", popText))
  )

  # State names are prefixed with a dot in the source file.
  states$stateName <- gsub("^[.]{1}", "", states$stateName)
  states
}