1. Download the dataset and read the CSV file: HW1_US_Counties.csv
# Some housekeeping:
# Set the working directory:
setwd("C:/Users/Andrew/SkyDrive/workspace_R")
dirpath <- "C:/Users/Andrew/SkyDrive/workspace_R/data"
# List the data directory to make sure the CSV file is there.
# (The argument is spelled out in full; `full=` only worked through partial
# argument matching.)
dir(dirpath, full.names = TRUE)
# Download and save HW1_US_Counties.csv into
# "C:/Users/Andrew/SkyDrive/workspace_R/data" before running the next step.
# Read the file. The path is built from `dirpath`, so the file that is read is
# the one in the directory that was just listed.
HW1_US_Counties <- read.csv(
  file.path(dirpath, "HW1_US_Counties.csv"),
  stringsAsFactors = FALSE
)
# Verify:
dim(HW1_US_Counties)
names(HW1_US_Counties)
# Some preparation: attach the plotting and data-manipulation packages.
# The `##` lines below are the console output recorded when this was run.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.2
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# NOTE(review): plyr is attached AFTER dplyr, so plyr's arrange, count, desc,
# mutate, rename, summarise, ... mask the dplyr functions of the same name
# (see the recorded message below). Bare calls to those names later in this
# script therefore run the plyr versions. Do not reorder these two library()
# calls without checking every such call first.
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.2
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Convert to a local data frame (tbl_df) to make printing more readable and
# presentable.
USCounties_df <- tbl_df(HW1_US_Counties)
USCounties_df
# A few utilities to get to know the data:
head(USCounties_df)
tail(USCounties_df)
names(USCounties_df)
length(USCounties_df) # for a data frame this is the number of columns
str(USCounties_df)
class(USCounties_df)
dim(USCounties_df)
# Random samples: five rows, then 25% of the rows drawn with replacement.
USCounties_df %>% sample_n(5)
# `replace` is spelled out in full; the earlier `replac=` only worked through
# partial argument matching.
USCounties_df %>% sample_frac(0.25, replace = TRUE)
glimpse(USCounties_df)
2. Rename column headers with clearer names.
For example:
Rename column #9 from "POPESTIMATE2013" to "POPEST2013".
# 1. Base R: assign into names() directly.
# Rename column #8 from "CENSUS2010POP" to "CENSUS2010" and column #12 from
# "NATURALINC2013" to "INC2013".
# Note: this renaming is stored in the data set itself.
new_names <- c(CENSUS2010POP = "CENSUS2010", NATURALINC2013 = "INC2013")
to_rename <- names(USCounties_df) %in% names(new_names)
names(USCounties_df)[to_rename] <-
  unname(new_names[names(USCounties_df)[to_rename]])
# Several columns can also be renamed by position in a single assignment:
# names(uscounties)[c(1,2,8)] <- c("State_Name", "County_Name", "Pop_2010")
head(USCounties_df, n = 6L)
## Source: local data frame [6 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1 Alabama Autauga County 50 3 6 1 1 54571
## 2 Alabama Baldwin County 50 3 6 1 3 182265
## 3 Alabama Barbour County 50 3 6 1 5 27457
## 4 Alabama Bibb County 50 3 6 1 7 22915
## 5 Alabama Blount County 50 3 6 1 9 57322
## 6 Alabama Bullock County 50 3 6 1 11 10914
## Variables not shown: POPESTIMATE2013 (int), BIRTHS2013 (int), DEATHS2013
## (int), INC2013 (int), INTERNATIONALMIG2013 (int), USPS (chr), ALAND_SQMI
## (dbl), AWATER_SQMI (dbl)
# 2. Use plyr::rename(), which takes a named vector c(old_name = "new_name").
# Rename column #9 from "POPESTIMATE2013" to "POPEST2013" and column #13 from
# "INTERNATIONALMIG2013" to "INTER'LMIG2013".
# The call is namespace-qualified because dplyr also exports a rename() with a
# different calling convention; which one a bare rename() reaches depends on
# the order in which the two packages were attached.
# Note: the result is only printed, not assigned, so the data set is unchanged.
# Note: "INTER'LMIG2013" contains an apostrophe, so that column would have to
# be written with backticks wherever it is used as a bare name.
plyr::rename(
  USCounties_df,
  c("POPESTIMATE2013" = "POPEST2013", "INTERNATIONALMIG2013" = "INTER'LMIG2013")
)
## Source: local data frame [3,144 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1 Alabama Autauga County 50 3 6 1 1 54571
## 2 Alabama Baldwin County 50 3 6 1 3 182265
## 3 Alabama Barbour County 50 3 6 1 5 27457
## 4 Alabama Bibb County 50 3 6 1 7 22915
## 5 Alabama Blount County 50 3 6 1 9 57322
## 6 Alabama Bullock County 50 3 6 1 11 10914
## 7 Alabama Butler County 50 3 6 1 13 20947
## 8 Alabama Calhoun County 50 3 6 1 15 118572
## 9 Alabama Chambers County 50 3 6 1 17 34215
## 10 Alabama Cherokee County 50 3 6 1 19 25989
## .. ... ... ... ... ... ... ... ...
## Variables not shown: POPEST2013 (int), BIRTHS2013 (int), DEATHS2013 (int),
## INC2013 (int), INTER'LMIG2013 (int), USPS (chr), ALAND_SQMI (dbl),
## AWATER_SQMI (dbl)
# The preview above was not assigned, so the original names are still in place:
names(USCounties_df)
## [1] "STNAME" "CTYNAME" "SUMLEV"
## [4] "REGION" "DIVISION" "STATE"
## [7] "COUNTY" "CENSUS2010" "POPESTIMATE2013"
## [10] "BIRTHS2013" "DEATHS2013" "INC2013"
## [13] "INTERNATIONALMIG2013" "USPS" "ALAND_SQMI"
## [16] "AWATER_SQMI"
names(USCounties_df)[15]
## [1] "ALAND_SQMI"
## Tam's solution: rename in one command by assigning into names().
# The column is selected by its name rather than by position 15, so the rename
# still hits the land-area column if the column order ever changes.
names(USCounties_df)[names(USCounties_df) == "ALAND_SQMI"] <- "Area"
names(USCounties_df)
## [1] "STNAME" "CTYNAME" "SUMLEV"
## [4] "REGION" "DIVISION" "STATE"
## [7] "COUNTY" "CENSUS2010" "POPESTIMATE2013"
## [10] "BIRTHS2013" "DEATHS2013" "INC2013"
## [13] "INTERNATIONALMIG2013" "USPS" "Area"
## [16] "AWATER_SQMI"
3. Sort All US Counties by Decreasing Area and print it out
# Base R approach: sorting with the built-in order() function is shown further
# below (see "Ram's way").
# Package approach, sorting by county name in descending order as a first try.
# NOTE(review): plyr is attached after dplyr in this script, so arrange() and
# desc() here resolve to the plyr versions - confirm this is intended.
arrange(USCounties_df, desc(CTYNAME))
## Source: local data frame [3,144 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 1 South Dakota Ziebach County 50 2 4 46
## 2 Texas Zavala County 50 3 7 48
## 3 Texas Zapata County 50 3 7 48
## 4 Arizona Yuma County 50 4 8 4
## 5 Colorado Yuma County 50 4 8 8
## 6 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 7 California Yuba County 50 4 9 6
## 8 Texas Young County 50 3 7 48
## 9 Maine York County 50 1 1 23
## 10 Nebraska York County 50 2 4 31
## .. ... ... ... ... ... ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
## (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
## INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)
# Sort all counties by decreasing area (largest first) and print the result.
USCounties_df %>% arrange(desc(Area))
## Source: local data frame [3,144 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 1 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 2 Alaska North Slope Borough 50 4 9 2
## 3 Alaska Bethel Census Area 50 4 9 2
## 4 Alaska Northwest Arctic Borough 50 4 9 2
## 5 Alaska Valdez-Cordova Census Area 50 4 9 2
## 6 Alaska Southeast Fairbanks Census Area 50 4 9 2
## 7 Alaska Matanuska-Susitna Borough 50 4 9 2
## 8 Alaska Lake and Peninsula Borough 50 4 9 2
## 9 Alaska Nome Census Area 50 4 9 2
## 10 California San Bernardino County 50 4 9 6
## .. ... ... ... ... ... ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
## (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
## INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)
## Ram's way:
# Base R sorts a data frame by indexing its rows with the result of order().
# The sorted rows could be assigned to another data frame; here they are only
# printed.
# The local data frame USCounties_df is used.
area_desc <- order(USCounties_df$Area, decreasing = TRUE)
USCounties_df[area_desc, ]
## Source: local data frame [3,144 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 1 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 2 Alaska North Slope Borough 50 4 9 2
## 3 Alaska Bethel Census Area 50 4 9 2
## 4 Alaska Northwest Arctic Borough 50 4 9 2
## 5 Alaska Valdez-Cordova Census Area 50 4 9 2
## 6 Alaska Southeast Fairbanks Census Area 50 4 9 2
## 7 Alaska Matanuska-Susitna Borough 50 4 9 2
## 8 Alaska Lake and Peninsula Borough 50 4 9 2
## 9 Alaska Nome Census Area 50 4 9 2
## 10 California San Bernardino County 50 4 9 6
## .. ... ... ... ... ... ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
## (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
## INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)
# Ten largest counties by area: keep the first ten positions of the descending
# ordering and index the data frame once.
USCounties_df[head(order(USCounties_df$Area, decreasing = TRUE), 10), ]
## Source: local data frame [10 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 1 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 2 Alaska North Slope Borough 50 4 9 2
## 3 Alaska Bethel Census Area 50 4 9 2
## 4 Alaska Northwest Arctic Borough 50 4 9 2
## 5 Alaska Valdez-Cordova Census Area 50 4 9 2
## 6 Alaska Southeast Fairbanks Census Area 50 4 9 2
## 7 Alaska Matanuska-Susitna Borough 50 4 9 2
## 8 Alaska Lake and Peninsula Borough 50 4 9 2
## 9 Alaska Nome Census Area 50 4 9 2
## 10 California San Bernardino County 50 4 9 6
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
## (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
## INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)
# Or: sort the plain data frame HW1_US_Counties by "CTYNAME" (descending).
# The ordering is computed from the same object whose rows are indexed, so the
# result does not rely on HW1_US_Counties and USCounties_df having identical
# row order.
head(HW1_US_Counties[order(HW1_US_Counties$CTYNAME, decreasing = TRUE), ], 10)
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 2429 South Dakota Ziebach County 50 2 4 46
## 2778 Texas Zavala County 50 3 7 48
## 2777 Texas Zapata County 50 3 7 48
## 111 Arizona Yuma County 50 4 8 4
## 308 Colorado Yuma County 50 4 8 8
## 96 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 244 California Yuba County 50 4 9 6
## 2776 Texas Young County 50 3 7 48
## 1194 Maine York County 50 1 1 23
## 1748 Nebraska York County 50 2 4 31
## COUNTY CENSUS2010POP POPESTIMATE2013 BIRTHS2013 DEATHS2013
## 2429 137 2801 2834 39 19
## 2778 507 11677 12156 204 90
## 2777 505 14018 14390 283 94
## 111 27 195751 201201 3147 1414
## 308 125 10043 10151 154 91
## 96 290 5588 5695 107 65
## 244 115 72155 73340 1282 471
## 2776 503 18550 18341 222 231
## 1194 31 197131 199431 1831 1739
## 1748 185 13665 13883 177 130
## NATURALINC2013 INTERNATIONALMIG2013 USPS ALAND_SQMI AWATER_SQMI
## 2429 20 2 SD 1961.386 9.348
## 2778 114 12 TX 1297.406 4.328
## 2777 189 3 TX 998.412 59.603
## 111 1733 219 AZ 5513.997 5.087
## 308 63 15 CO 2364.394 4.299
## 96 42 0 AK 145505.165 2299.769
## 244 811 91 CA 632.020 11.969
## 2776 -9 -2 TX 914.469 16.389
## 1194 92 59 ME 990.748 278.965
## 1748 47 8 NE 572.510 3.319
# The same kind of sort on the local data frame, here by decreasing area:
with(USCounties_df, USCounties_df[order(Area, decreasing = TRUE), ])
## Source: local data frame [3,144 x 16]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE
## 1 Alaska Yukon-Koyukuk Census Area 50 4 9 2
## 2 Alaska North Slope Borough 50 4 9 2
## 3 Alaska Bethel Census Area 50 4 9 2
## 4 Alaska Northwest Arctic Borough 50 4 9 2
## 5 Alaska Valdez-Cordova Census Area 50 4 9 2
## 6 Alaska Southeast Fairbanks Census Area 50 4 9 2
## 7 Alaska Matanuska-Susitna Borough 50 4 9 2
## 8 Alaska Lake and Peninsula Borough 50 4 9 2
## 9 Alaska Nome Census Area 50 4 9 2
## 10 California San Bernardino County 50 4 9 6
## .. ... ... ... ... ... ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
## (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
## INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)
5. Create a new column to calculate the population density of each County.
# 1. Base R approach: divide the 2010 census population by the area and store
# the result as a new column of the data set.
USCounties_df[["POPDENSITY"]] <- with(USCounties_df, CENSUS2010 / Area)
names(USCounties_df)
## [1] "STNAME" "CTYNAME" "SUMLEV"
## [4] "REGION" "DIVISION" "STATE"
## [7] "COUNTY" "CENSUS2010" "POPESTIMATE2013"
## [10] "BIRTHS2013" "DEATHS2013" "INC2013"
## [13] "INTERNATIONALMIG2013" "USPS" "Area"
## [16] "AWATER_SQMI" "POPDENSITY"
head(USCounties_df, n = 6L)
## Source: local data frame [6 x 17]
##
## STNAME CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1 Alabama Autauga County 50 3 6 1 1 54571
## 2 Alabama Baldwin County 50 3 6 1 3 182265
## 3 Alabama Barbour County 50 3 6 1 5 27457
## 4 Alabama Bibb County 50 3 6 1 7 22915
## 5 Alabama Blount County 50 3 6 1 9 57322
## 6 Alabama Bullock County 50 3 6 1 11 10914
## Variables not shown: POPESTIMATE2013 (int), BIRTHS2013 (int), DEATHS2013
## (int), INC2013 (int), INTERNATIONALMIG2013 (int), USPS (chr), Area
## (dbl), AWATER_SQMI (dbl), POPDENSITY (dbl)
# 2. dplyr approach
# select() and mutate() are namespace-qualified: plyr is attached after dplyr
# in this script, so a bare mutate() would run plyr::mutate() instead.
USCounties_df %>%
  # Optional step: it only narrows the columns shown in the printout.
  dplyr::select(STNAME, CTYNAME, COUNTY, CENSUS2010, POPDENSITY, Area) %>%
  dplyr::mutate(POPDENSITY2 = CENSUS2010 / Area)
## Source: local data frame [3,144 x 7]
##
## STNAME CTYNAME COUNTY CENSUS2010 POPDENSITY Area
## 1 Alabama Autauga County 1 54571 91.80283 594.437
## 2 Alabama Baldwin County 3 182265 114.64361 1589.840
## 3 Alabama Barbour County 5 27457 31.02921 884.876
## 4 Alabama Bibb County 7 22915 36.80634 622.583
## 5 Alabama Blount County 9 57322 88.89792 644.807
## 6 Alabama Bullock County 11 10914 17.52394 622.805
## 7 Alabama Butler County 13 20947 26.96478 776.828
## 8 Alabama Calhoun County 15 118572 195.70664 605.866
## 9 Alabama Chambers County 17 34215 57.35662 596.531
## 10 Alabama Cherokee County 19 25989 46.93527 553.720
## .. ... ... ... ... ... ...
## Variables not shown: POPDENSITY2 (dbl)
# Note: unlike the base R assignment, the piped result was only printed and not
# assigned, so POPDENSITY2 is not stored in USCounties_df.
names(USCounties_df)
## [1] "STNAME" "CTYNAME" "SUMLEV"
## [4] "REGION" "DIVISION" "STATE"
## [7] "COUNTY" "CENSUS2010" "POPESTIMATE2013"
## [10] "BIRTHS2013" "DEATHS2013" "INC2013"
## [13] "INTERNATIONALMIG2013" "USPS" "Area"
## [16] "AWATER_SQMI" "POPDENSITY"
6. How many counties are there in the US?
# The answer is the number of rows, so ask for it directly with nrow() instead
# of reading the first element of dim().
# NOTE(review): this assumes one row per county (the names include county
# equivalents such as Alaska boroughs and census areas) - confirm.
nrow(USCounties_df)
## [1] 3144
# There are 3144 counties.
7. Store just the names of all US counties in a new vector.
# Pull the county-name column out as a plain vector and preview the first six.
USCounties_df_name <- USCounties_df[["CTYNAME"]]
head(USCounties_df_name, n = 6L)
## [1] "Autauga County" "Baldwin County" "Barbour County" "Bibb County"
## [5] "Blount County" "Bullock County"
8. Which counties have the most and the fewest people living in them? (Use column “POPESTIMATE2013”)
# Part One: use the dplyr package.
## Find the minimum and maximum of "POPESTIMATE2013". This yields the two
## numbers only; the matching county names are looked up in a separate step.
# dplyr::summarise() is called explicitly: plyr is attached after dplyr in this
# script and masks summarise(). Plain summarise() with named expressions also
# replaces the deprecated summarise_each(funs(...)) form and gives the same
# `min` / `max` columns.
USCounties_df %>%
  dplyr::summarise(
    min = min(POPESTIMATE2013, na.rm = TRUE),
    max = max(POPESTIMATE2013, na.rm = TRUE)
  )
## Source: local data frame [1 x 2]
##
## min max
## 1 90 10017068
# Result: max: 10017068; min: 90
# Look up the county names that go with the extremes of "POPESTIMATE2013".
# The extremes are computed here instead of being typed in as literals, so the
# lookup stays correct if the data changes. which() drops NA comparisons, and
# every county tied for an extreme is returned.
pop_2013 <- USCounties_df$POPESTIMATE2013
# County with the largest population (Los Angeles County):
USCounties_df$CTYNAME[which(pop_2013 == max(pop_2013, na.rm = TRUE))]
## [1] "Los Angeles County"
# County with the smallest population (Kalawao County):
USCounties_df$CTYNAME[which(pop_2013 == min(pop_2013, na.rm = TRUE))]
## [1] "Kalawao County"
# Part 2: Ram's way - find the row position of the extreme value and use it to
# pick the county name (the population number itself is not shown).
maxrow <- which.max(USCounties_df[["POPESTIMATE2013"]])
USCounties_df[["CTYNAME"]][maxrow]
## [1] "Los Angeles County"
# The same as a single expression, for the largest and the smallest county:
with(USCounties_df, CTYNAME[which.max(POPESTIMATE2013)])
## [1] "Los Angeles County"
with(USCounties_df, CTYNAME[which.min(POPESTIMATE2013)])
## [1] "Kalawao County"
9. List of counties with an area greater than 5000 sq. miles
# Keep the rows whose area is above 5000, then list the county names.
# which() drops rows where the comparison is NA, as subset() does.
is_big <- USCounties_df$Area > 5000
bigCounties <- USCounties_df[which(is_big), ]
bigCounties[["CTYNAME"]]
## [1] "Aleutians East Borough" "Bethel Census Area"
## [3] "Denali Borough" "Dillingham Census Area"
## [5] "Fairbanks North Star Borough" "Hoonah-Angoon Census Area"
## [7] "Kenai Peninsula Borough" "Kodiak Island Borough"
## [9] "Lake and Peninsula Borough" "Matanuska-Susitna Borough"
## [11] "Nome Census Area" "North Slope Borough"
## [13] "Northwest Arctic Borough" "Southeast Fairbanks Census Area"
## [15] "Valdez-Cordova Census Area" "Wade Hampton Census Area"
## [17] "Yakutat City and Borough" "Yukon-Koyukuk Census Area"
## [19] "Apache County" "Cochise County"
## [21] "Coconino County" "Maricopa County"
## [23] "Mohave County" "Navajo County"
## [25] "Pima County" "Pinal County"
## [27] "Yavapai County" "Yuma County"
## [29] "Fresno County" "Inyo County"
## [31] "Kern County" "Riverside County"
## [33] "San Bernardino County" "Siskiyou County"
## [35] "Idaho County" "Owyhee County"
## [37] "Aroostook County" "St. Louis County"
## [39] "Beaverhead County" "Flathead County"
## [41] "Phillips County" "Rosebud County"
## [43] "Cherry County" "Clark County"
## [45] "Elko County" "Humboldt County"
## [47] "Lander County" "Lincoln County"
## [49] "Nye County" "Pershing County"
## [51] "Washoe County" "White Pine County"
## [53] "Catron County" "Chaves County"
## [55] "McKinley County" "Otero County"
## [57] "Rio Arriba County" "San Juan County"
## [59] "Socorro County" "Douglas County"
## [61] "Harney County" "Klamath County"
## [63] "Lake County" "Malheur County"
## [65] "Brewster County" "Box Elder County"
## [67] "Garfield County" "Millard County"
## [69] "San Juan County" "Tooele County"
## [71] "Okanogan County" "Carbon County"
## [73] "Fremont County" "Natrona County"
## [75] "Park County" "Sweetwater County"
# Alternative: let subset() also pick the columns to show (state, county name
# and area). The stray trailing comma, which passed an empty extra argument,
# has been removed.
subset(USCounties_df, Area > 5000, select = c(STNAME, CTYNAME, Area))
## Source: local data frame [76 x 3]
##
## STNAME CTYNAME Area
## 1 Alaska Aleutians East Borough 6981.867
## 2 Alaska Bethel Census Area 40570.001
## 3 Alaska Denali Borough 12751.719
## 4 Alaska Dillingham Census Area 18568.839
## 5 Alaska Fairbanks North Star Borough 7338.518
## 6 Alaska Hoonah-Angoon Census Area 7524.904
## 7 Alaska Kenai Peninsula Borough 16075.352
## 8 Alaska Kodiak Island Borough 6549.726
## 9 Alaska Lake and Peninsula Borough 23652.048
## 10 Alaska Matanuska-Susitna Borough 24608.141
## 11 Alaska Nome Census Area 22961.699
## 12 Alaska North Slope Borough 88695.579
## 13 Alaska Northwest Arctic Borough 35767.822
## 14 Alaska Southeast Fairbanks Census Area 24768.569
## 15 Alaska Valdez-Cordova Census Area 34239.955
## 16 Alaska Wade Hampton Census Area 17081.277
## 17 Alaska Yakutat City and Borough 7649.325
## 18 Alaska Yukon-Koyukuk Census Area 145505.165
## 19 Arizona Apache County 11197.521
## 20 Arizona Cochise County 6165.661
## 21 Arizona Coconino County 18618.869
## 22 Arizona Maricopa County 9200.075
## 23 Arizona Mohave County 13310.980
## 24 Arizona Navajo County 9950.435
## 25 Arizona Pima County 9187.004
## 26 Arizona Pinal County 5365.665
## 27 Arizona Yavapai County 8123.477
## 28 Arizona Yuma County 5513.997
## 29 California Fresno County 5958.869
## 30 California Inyo County 10180.938
## 31 California Kern County 8131.951
## 32 California Riverside County 7206.435
## 33 California San Bernardino County 20057.108
## 34 California Siskiyou County 6277.888
## 35 Idaho Idaho County 8477.342
## 36 Idaho Owyhee County 7665.540
## 37 Maine Aroostook County 6671.091
## 38 Minnesota St. Louis County 6247.568
## 39 Montana Beaverhead County 5541.607
## 40 Montana Flathead County 5087.670
## 41 Montana Phillips County 5140.086
## 42 Montana Rosebud County 5010.315
## 43 Nebraska Cherry County 5960.555
## 44 Nevada Clark County 7891.493
## 45 Nevada Elko County 17169.822
## 46 Nevada Humboldt County 9640.776
## 47 Nevada Lander County 5490.092
## 48 Nevada Lincoln County 10633.200
## 49 Nevada Nye County 18181.934
## 50 Nevada Pershing County 6036.599
## 51 Nevada Washoe County 6302.369
## 52 Nevada White Pine County 8875.650
## 53 New Mexico Catron County 6923.674
## 54 New Mexico Chaves County 6065.277
## 55 New Mexico McKinley County 5449.820
## 56 New Mexico Otero County 6613.211
## 57 New Mexico Rio Arriba County 5860.839
## 58 New Mexico San Juan County 5513.091
## 59 New Mexico Socorro County 6646.581
## 60 Oregon Douglas County 5036.459
## 61 Oregon Harney County 10133.215
## 62 Oregon Klamath County 5941.048
## 63 Oregon Lake County 8138.978
## 64 Oregon Malheur County 9887.555
## 65 Texas Brewster County 6183.776
## 66 Utah Box Elder County 5745.552
## 67 Utah Garfield County 5175.096
## 68 Utah Millard County 6595.899
## 69 Utah San Juan County 7820.107
## 70 Utah Tooele County 6941.140
## 71 Washington Okanogan County 5267.998
## 72 Wyoming Carbon County 7897.550
## 73 Wyoming Fremont County 9183.843
## 74 Wyoming Natrona County 5340.512
## 75 Wyoming Park County 6942.034
## 76 Wyoming Sweetwater County 10426.637
# Total 76
10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)
# Take the 2013 population estimates as a plain vector and print the five
# default quantiles (minimum, quartiles, maximum).
popByQuartiles <- USCounties_df[["POPESTIMATE2013"]]
quantile(popByQuartiles, probs = seq(0, 1, by = 0.25))
## 0% 25% 50% 75% 100%
## 90.00 11015.75 25733.00 67582.25 10017068.00
# The quantiles to report can also be chosen explicitly:
quantile(popByQuartiles, probs = c(0.25, 0.5, 0.75))
## 25% 50% 75%
## 11015.75 25733.00 67582.25
# The quartile boundaries decide which group each county falls into.
# quantile() is evaluated once and the boundaries are picked by name instead of
# by position; firstQ, med and thirdQ keep their "25%" / "50%" / "75%" labels.
pop_quartiles <- quantile(USCounties_df$POPESTIMATE2013)
firstQ <- pop_quartiles["25%"]
med <- pop_quartiles["50%"]
thirdQ <- pop_quartiles["75%"]
# Topmost quartile: counties with a population above the third quartile.
head(
  USCounties_df[
    USCounties_df$POPESTIMATE2013 > thirdQ,
    c("CTYNAME", "POPESTIMATE2013")
  ]
)
## Source: local data frame [6 x 2]
##
## CTYNAME POPESTIMATE2013
## 1 Baldwin County 195540
## 2 Calhoun County 116736
## 3 Cullman County 80811
## 4 DeKalb County 71013
## 5 Elmore County 80902
## 6 Etowah County 103931
# Counties between the second and the third quartile.
# The upper bound is inclusive: the topmost group is selected with `> thirdQ`,
# so with a strict `<` here a county lying exactly on the boundary would fall
# into neither group.
in_third_quarter <- USCounties_df$POPESTIMATE2013 > med &
  USCounties_df$POPESTIMATE2013 <= thirdQ
head(USCounties_df$CTYNAME[in_third_quarter])
## [1] "Autauga County" "Barbour County" "Blount County" "Chambers County"
## [5] "Cherokee County" "Chilton County"
11. I would like to find some small counties to live in, where the population is around 10000.
# Counties whose 2010 census population lies strictly between 10000 and 10100,
# shown with state, county name and population.
# which() drops rows where the comparison is NA, as subset() does.
near_10k <- with(USCounties_df, which(CENSUS2010 > 10000 & CENSUS2010 < 10100))
USCounties_df[near_10k, c("STNAME", "CTYNAME", "CENSUS2010")]
## Source: local data frame [7 x 3]
##
## STNAME CTYNAME CENSUS2010
## 1 Colorado Yuma County 10043
## 2 Georgia Lanier County 10078
## 3 Kansas Rice County 10083
## 4 Kentucky Metcalfe County 10099
## 5 Montana Carbon County 10078
## 6 Nevada White Pine County 10030
## 7 Virginia Cumberland County 10052