Homework #1

1. Download the dataset and read the CSV file: HW1_US_Counties.csv

# Some housekeeping:
# Set the working directory so that the relative "data/..." path used when
# reading the CSV resolves.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running the script on another computer.

setwd("C:/Users/Andrew/SkyDrive/workspace_R")

dirpath <- "C:/Users/Andrew/SkyDrive/workspace_R/data"
# List the data directory with full paths to make sure the CSV file is there.
# The argument is spelled out as full.names; "full" only worked through R's
# partial argument matching.
dir(dirpath, full.names = TRUE)

# Download HW1_US_Counties.csv and save it under
# "C:/Users/Andrew/SkyDrive/workspace_R/data", then load it.
# Text columns are kept as character vectors rather than converted to factors.

HW1_US_Counties <- read.csv(
  file = "data/HW1_US_Counties.csv",
  stringsAsFactors = FALSE
)
# Sanity check: dimensions and column names of what was read.
dim(HW1_US_Counties)
names(HW1_US_Counties)


# some preparation
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.1.2

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.1.2

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plyr)

## Warning: package 'plyr' was built under R version 3.1.2

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

# Wrap the raw data frame as a dplyr "local data frame" so that printing it
# shows a compact preview instead of every row and column.
USCounties_df <- HW1_US_Counties %>% tbl_df()
USCounties_df

# A few utilities to get to know the data set:
head(USCounties_df)    # first rows
tail(USCounties_df)    # last rows
names(USCounties_df)   # column names
length(USCounties_df)  # number of columns
str(USCounties_df)     # column types with sample values
class(USCounties_df)
dim(USCounties_df)     # rows x columns

# Random samples: 5 rows, then 25% of the rows drawn with replacement.
USCounties_df %>% sample_n(5)
# The argument is spelled out in full ("replace"); the former "replac" only
# worked through R's partial argument matching.
USCounties_df %>% sample_frac(0.25, replace = TRUE)
glimpse(USCounties_df)

2. Rename Column-headers with clearer Names.

For example: 
Rename col #9 from "POPESTIMATE2013" to "POPEST2013".

# 1. Rename in place by assigning into names() (base R):
# col #8 "CENSUS2010POP" becomes "CENSUS2010" and col #12 "NATURALINC2013"
# becomes "INC2013".
# Note: this modifies USCounties_df itself, so the new names persist.
names(USCounties_df)[which(names(USCounties_df) == "CENSUS2010POP")] <- "CENSUS2010"
names(USCounties_df)[which(names(USCounties_df) == "NATURALINC2013")] <- "INC2013"
# Several columns could also be renamed in one assignment by position, e.g.:
# names(USCounties_df)[c(1, 2, 8)] <- c("State_Name", "County_Name", "Pop_2010")
USCounties_df %>% head()

## Source: local data frame [6 x 16]
## 
##    STNAME        CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1 Alabama Autauga County     50      3        6     1      1      54571
## 2 Alabama Baldwin County     50      3        6     1      3     182265
## 3 Alabama Barbour County     50      3        6     1      5      27457
## 4 Alabama    Bibb County     50      3        6     1      7      22915
## 5 Alabama  Blount County     50      3        6     1      9      57322
## 6 Alabama Bullock County     50      3        6     1     11      10914
## Variables not shown: POPESTIMATE2013 (int), BIRTHS2013 (int), DEATHS2013
##   (int), INC2013 (int), INTERNATIONALMIG2013 (int), USPS (chr), ALAND_SQMI
##   (dbl), AWATER_SQMI (dbl)

# 2. Use plyr::rename()
# Rename col #9 "POPESTIMATE2013" to "POPEST2013" and col #13
# "INTERNATIONALMIG2013" to "INTER'LMIG2013".
# The c(old = new) syntax belongs to plyr's rename(). The explicit plyr::
# prefix keeps the call working no matter whether plyr or dplyr was attached
# last (both packages export a function called rename).
# Note: the result is only printed, not assigned, so USCounties_df is unchanged.
plyr::rename(
  USCounties_df,
  c("POPESTIMATE2013" = "POPEST2013", "INTERNATIONALMIG2013" = "INTER'LMIG2013")
)

## Source: local data frame [3,144 x 16]
## 
##     STNAME         CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1  Alabama  Autauga County     50      3        6     1      1      54571
## 2  Alabama  Baldwin County     50      3        6     1      3     182265
## 3  Alabama  Barbour County     50      3        6     1      5      27457
## 4  Alabama     Bibb County     50      3        6     1      7      22915
## 5  Alabama   Blount County     50      3        6     1      9      57322
## 6  Alabama  Bullock County     50      3        6     1     11      10914
## 7  Alabama   Butler County     50      3        6     1     13      20947
## 8  Alabama  Calhoun County     50      3        6     1     15     118572
## 9  Alabama Chambers County     50      3        6     1     17      34215
## 10 Alabama Cherokee County     50      3        6     1     19      25989
## ..     ...             ...    ...    ...      ...   ...    ...        ...
## Variables not shown: POPEST2013 (int), BIRTHS2013 (int), DEATHS2013 (int),
##   INC2013 (int), INTER'LMIG2013 (int), USPS (chr), ALAND_SQMI (dbl),
##   AWATER_SQMI (dbl)

# Column names as currently stored in USCounties_df.
colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "ALAND_SQMI"          
## [16] "AWATER_SQMI"

# Which column currently sits at position 15?
colnames(USCounties_df)[15]

## [1] "ALAND_SQMI"

## Tam's solution: rename a single column in one command by indexing names().
# The column is selected by its current name instead of the hard-coded
# position 15, so the rename still hits the land-area column if the column
# order ever changes.
names(USCounties_df)[names(USCounties_df) == "ALAND_SQMI"] <- "Area"
names(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"

3. Sort All US Counties by Decreasing Area and print it out

# Sorting with the arrange() verb (a base R alternative using order() is
# shown further below).
# NOTE(review): plyr was attached after dplyr, so arrange() and desc() resolve
# to plyr's versions here.
# Warm-up: sort by county name in descending alphabetical order.
USCounties_df %>% arrange(desc(CTYNAME))

## Source: local data frame [3,144 x 16]
## 
##          STNAME                   CTYNAME SUMLEV REGION DIVISION STATE
## 1  South Dakota            Ziebach County     50      2        4    46
## 2         Texas             Zavala County     50      3        7    48
## 3         Texas             Zapata County     50      3        7    48
## 4       Arizona               Yuma County     50      4        8     4
## 5      Colorado               Yuma County     50      4        8     8
## 6        Alaska Yukon-Koyukuk Census Area     50      4        9     2
## 7    California               Yuba County     50      4        9     6
## 8         Texas              Young County     50      3        7    48
## 9         Maine               York County     50      1        1    23
## 10     Nebraska               York County     50      2        4    31
## ..          ...                       ...    ...    ...      ...   ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
##   (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
##   INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)

# The actual task: largest "Area" first.
USCounties_df %>% arrange(desc(Area))

## Source: local data frame [3,144 x 16]
## 
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## ..        ...                             ...    ...    ...      ...   ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
##   (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
##   INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)

## Ram's way:
# Base R sorts through order(), which returns the row positions in sorted
# order; indexing the rows with that vector yields the sorted data frame.
# The result can be assigned to another data frame if it should be kept.

# Applied to USCounties_df:

with(USCounties_df, USCounties_df[order(Area, decreasing = TRUE), ])

## Source: local data frame [3,144 x 16]
## 
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## ..        ...                             ...    ...    ...      ...   ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
##   (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
##   INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)

# Only the ten largest counties by area.
USCounties_df[order(USCounties_df$Area, decreasing = TRUE), ] %>% head(10)

## Source: local data frame [10 x 16]
## 
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
##   (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
##   INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)

# Or: sort the plain data frame HW1_US_Counties by "CTYNAME" (descending).
# The ordering is computed from HW1_US_Counties' own column, so the row index
# always refers to the same object that is being subset.
head(HW1_US_Counties[order(HW1_US_Counties$CTYNAME, decreasing = TRUE), ], 10)

##            STNAME                   CTYNAME SUMLEV REGION DIVISION STATE
## 2429 South Dakota            Ziebach County     50      2        4    46
## 2778        Texas             Zavala County     50      3        7    48
## 2777        Texas             Zapata County     50      3        7    48
## 111       Arizona               Yuma County     50      4        8     4
## 308      Colorado               Yuma County     50      4        8     8
## 96         Alaska Yukon-Koyukuk Census Area     50      4        9     2
## 244    California               Yuba County     50      4        9     6
## 2776        Texas              Young County     50      3        7    48
## 1194        Maine               York County     50      1        1    23
## 1748     Nebraska               York County     50      2        4    31
##      COUNTY CENSUS2010POP POPESTIMATE2013 BIRTHS2013 DEATHS2013
## 2429    137          2801            2834         39         19
## 2778    507         11677           12156        204         90
## 2777    505         14018           14390        283         94
## 111      27        195751          201201       3147       1414
## 308     125         10043           10151        154         91
## 96      290          5588            5695        107         65
## 244     115         72155           73340       1282        471
## 2776    503         18550           18341        222        231
## 1194     31        197131          199431       1831       1739
## 1748    185         13665           13883        177        130
##      NATURALINC2013 INTERNATIONALMIG2013 USPS ALAND_SQMI AWATER_SQMI
## 2429             20                    2   SD   1961.386       9.348
## 2778            114                   12   TX   1297.406       4.328
## 2777            189                    3   TX    998.412      59.603
## 111            1733                  219   AZ   5513.997       5.087
## 308              63                   15   CO   2364.394       4.299
## 96               42                    0   AK 145505.165    2299.769
## 244             811                   91   CA    632.020      11.969
## 2776             -9                   -2   TX    914.469      16.389
## 1194             92                   59   ME    990.748     278.965
## 1748             47                    8   NE    572.510       3.319

# The same sort on the local data frame, picking the key column with [[ ]]:

USCounties_df[order(USCounties_df[["Area"]], decreasing = TRUE), ]

## Source: local data frame [3,144 x 16]
## 
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## ..        ...                             ...    ...    ...      ...   ...
## Variables not shown: COUNTY (int), CENSUS2010 (int), POPESTIMATE2013
##   (int), BIRTHS2013 (int), DEATHS2013 (int), INC2013 (int),
##   INTERNATIONALMIG2013 (int), USPS (chr), Area (dbl), AWATER_SQMI (dbl)

4. Save the sorted data frame in two formats: as a csv file, & as .Rda

# The task asks for the *sorted* data, but the sorted results printed above
# were never assigned, so USCounties_df is still in its original row order.
# Sort inside local(): the saved object keeps the name USCounties_df (the name
# load() will restore) while the global data frame used below stays untouched.
local({
  USCounties_df <- USCounties_df[order(USCounties_df$Area, decreasing = TRUE), ]
  write.csv(USCounties_df, file = "USCounties_df_new.csv")
  save(USCounties_df, file = "USCounties_df_new.Rda")
})

5. Create a new column to calculate the population density of each County.

# 1. Base R approach: assign the new column directly.
# Density = "CENSUS2010" population divided by the "Area" column.

USCounties_df[["POPDENSITY"]] <- with(USCounties_df, CENSUS2010 / Area)
colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"          "POPDENSITY"

# Preview the first rows, now including the new column.
USCounties_df %>% head()

## Source: local data frame [6 x 17]
## 
##    STNAME        CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
## 1 Alabama Autauga County     50      3        6     1      1      54571
## 2 Alabama Baldwin County     50      3        6     1      3     182265
## 3 Alabama Barbour County     50      3        6     1      5      27457
## 4 Alabama    Bibb County     50      3        6     1      7      22915
## 5 Alabama  Blount County     50      3        6     1      9      57322
## 6 Alabama Bullock County     50      3        6     1     11      10914
## Variables not shown: POPESTIMATE2013 (int), BIRTHS2013 (int), DEATHS2013
##   (int), INC2013 (int), INTERNATIONALMIG2013 (int), USPS (chr), Area
##   (dbl), AWATER_SQMI (dbl), POPDENSITY (dbl)

# 2. Verb-based approach: mutate() adds the computed column.
# The inner select() only narrows the printed columns and could be dropped.
mutate(
  select(USCounties_df, STNAME, CTYNAME, COUNTY, CENSUS2010, POPDENSITY, Area),
  POPDENSITY2 = CENSUS2010 / Area
)

## Source: local data frame [3,144 x 7]
## 
##     STNAME         CTYNAME COUNTY CENSUS2010 POPDENSITY     Area
## 1  Alabama  Autauga County      1      54571   91.80283  594.437
## 2  Alabama  Baldwin County      3     182265  114.64361 1589.840
## 3  Alabama  Barbour County      5      27457   31.02921  884.876
## 4  Alabama     Bibb County      7      22915   36.80634  622.583
## 5  Alabama   Blount County      9      57322   88.89792  644.807
## 6  Alabama  Bullock County     11      10914   17.52394  622.805
## 7  Alabama   Butler County     13      20947   26.96478  776.828
## 8  Alabama  Calhoun County     15     118572  195.70664  605.866
## 9  Alabama Chambers County     17      34215   57.35662  596.531
## 10 Alabama Cherokee County     19      25989   46.93527  553.720
## ..     ...             ...    ...        ...        ...      ...
## Variables not shown: POPDENSITY2 (dbl)

# Note: unlike the direct column assignment, the mutate() result was only
# printed and never assigned, so USCounties_df has no POPDENSITY2 column.

colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"          "POPDENSITY"

6. How many counties are there in the US?

# Rows and columns of the data frame; the first number is the row count.
c(nrow(USCounties_df), ncol(USCounties_df))

## [1] 3144   17

# It's 3144 counties

7. Store just the names of all US counties in a new vector.

# Pull the county-name column out as a plain character vector.
USCounties_df_name <- USCounties_df[["CTYNAME"]]
USCounties_df_name %>% head()

## [1] "Autauga County" "Baldwin County" "Barbour County" "Bibb County"   
## [5] "Blount County"  "Bullock County"

8. Which counties have the most and the least people living in them? (Use column “POPESTIMATE2013”)

# Part One: Use dplyr package:

## Find the minimum and maximum of "POPESTIMATE2013".
## The summary holds only the two numbers, not the matching county names, so
## the names are looked up separately afterwards.
USCounties_df %>%
     # Earlier attempts, kept for reference:
     #group_by(POPESTIMATE2013) %>%
     # summarise_each(funs(min(., na.rm=TRUE),  max(., na.rm=TRUE)), select = c(STNAME, CTYNAME, CENSUS2010), matches("CENSUS2010"))
     # Apply min() and max() (ignoring NAs) to the column selected by name.
     summarise_each(funs(min(., na.rm=TRUE),  max(., na.rm=TRUE)),  matches("POPESTIMATE2013"))

## Source: local data frame [1 x 2]
## 
##   min      max
## 1  90 10017068

# Result: max: 10017068; min: 90
# Look up the county with the largest "POPESTIMATE2013" (Los Angeles County).
# The maximum is computed here instead of pasting the printed number, so the
# lookup stays correct if the data changes.
USCounties_df$CTYNAME[
  USCounties_df$POPESTIMATE2013 == max(USCounties_df$POPESTIMATE2013, na.rm = TRUE)
]

## [1] "Los Angeles County"

# Look up the county with the smallest "POPESTIMATE2013" (Kalawao County).
# The minimum is computed here instead of pasting the printed number, so the
# lookup stays correct if the data changes.
USCounties_df$CTYNAME[
  USCounties_df$POPESTIMATE2013 == min(USCounties_df$POPESTIMATE2013, na.rm = TRUE)
]

## [1] "Kalawao County"

# Part 2: Ram's way (returns the county name but not the population figure):
# which.max() gives the row position of the largest value.
maxrow <- which.max(USCounties_df[["POPESTIMATE2013"]])
USCounties_df[["CTYNAME"]][maxrow]

## [1] "Los Angeles County"

# The same lookup as a single expression:
with(USCounties_df, CTYNAME[which.max(POPESTIMATE2013)])

## [1] "Los Angeles County"

# And the county with the smallest population estimate:
with(USCounties_df, CTYNAME[which.min(POPESTIMATE2013)])

## [1] "Kalawao County"

9. List of Counties with area greater than 5000 Sq Miles

# Keep only the counties whose "Area" exceeds 5000, then list their names.
bigCounties <- subset(x = USCounties_df, subset = Area > 5000)
bigCounties[["CTYNAME"]]

##  [1] "Aleutians East Borough"          "Bethel Census Area"             
##  [3] "Denali Borough"                  "Dillingham Census Area"         
##  [5] "Fairbanks North Star Borough"    "Hoonah-Angoon Census Area"      
##  [7] "Kenai Peninsula Borough"         "Kodiak Island Borough"          
##  [9] "Lake and Peninsula Borough"      "Matanuska-Susitna Borough"      
## [11] "Nome Census Area"                "North Slope Borough"            
## [13] "Northwest Arctic Borough"        "Southeast Fairbanks Census Area"
## [15] "Valdez-Cordova Census Area"      "Wade Hampton Census Area"       
## [17] "Yakutat City and Borough"        "Yukon-Koyukuk Census Area"      
## [19] "Apache County"                   "Cochise County"                 
## [21] "Coconino County"                 "Maricopa County"                
## [23] "Mohave County"                   "Navajo County"                  
## [25] "Pima County"                     "Pinal County"                   
## [27] "Yavapai County"                  "Yuma County"                    
## [29] "Fresno County"                   "Inyo County"                    
## [31] "Kern County"                     "Riverside County"               
## [33] "San Bernardino County"           "Siskiyou County"                
## [35] "Idaho County"                    "Owyhee County"                  
## [37] "Aroostook County"                "St. Louis County"               
## [39] "Beaverhead County"               "Flathead County"                
## [41] "Phillips County"                 "Rosebud County"                 
## [43] "Cherry County"                   "Clark County"                   
## [45] "Elko County"                     "Humboldt County"                
## [47] "Lander County"                   "Lincoln County"                 
## [49] "Nye County"                      "Pershing County"                
## [51] "Washoe County"                   "White Pine County"              
## [53] "Catron County"                   "Chaves County"                  
## [55] "McKinley County"                 "Otero County"                   
## [57] "Rio Arriba County"               "San Juan County"                
## [59] "Socorro County"                  "Douglas County"                 
## [61] "Harney County"                   "Klamath County"                 
## [63] "Lake County"                     "Malheur County"                 
## [65] "Brewster County"                 "Box Elder County"               
## [67] "Garfield County"                 "Millard County"                 
## [69] "San Juan County"                 "Tooele County"                  
## [71] "Okanogan County"                 "Carbon County"                  
## [73] "Fremont County"                  "Natrona County"                 
## [75] "Park County"                     "Sweetwater County"

# Or keep the state and the area next to each county name
# (the stray trailing empty argument of the earlier call has been removed):
subset(USCounties_df, Area > 5000, select = c(STNAME, CTYNAME, Area))

## Source: local data frame [76 x 3]
## 
##        STNAME                         CTYNAME       Area
## 1      Alaska          Aleutians East Borough   6981.867
## 2      Alaska              Bethel Census Area  40570.001
## 3      Alaska                  Denali Borough  12751.719
## 4      Alaska          Dillingham Census Area  18568.839
## 5      Alaska    Fairbanks North Star Borough   7338.518
## 6      Alaska       Hoonah-Angoon Census Area   7524.904
## 7      Alaska         Kenai Peninsula Borough  16075.352
## 8      Alaska           Kodiak Island Borough   6549.726
## 9      Alaska      Lake and Peninsula Borough  23652.048
## 10     Alaska       Matanuska-Susitna Borough  24608.141
## 11     Alaska                Nome Census Area  22961.699
## 12     Alaska             North Slope Borough  88695.579
## 13     Alaska        Northwest Arctic Borough  35767.822
## 14     Alaska Southeast Fairbanks Census Area  24768.569
## 15     Alaska      Valdez-Cordova Census Area  34239.955
## 16     Alaska        Wade Hampton Census Area  17081.277
## 17     Alaska        Yakutat City and Borough   7649.325
## 18     Alaska       Yukon-Koyukuk Census Area 145505.165
## 19    Arizona                   Apache County  11197.521
## 20    Arizona                  Cochise County   6165.661
## 21    Arizona                 Coconino County  18618.869
## 22    Arizona                 Maricopa County   9200.075
## 23    Arizona                   Mohave County  13310.980
## 24    Arizona                   Navajo County   9950.435
## 25    Arizona                     Pima County   9187.004
## 26    Arizona                    Pinal County   5365.665
## 27    Arizona                  Yavapai County   8123.477
## 28    Arizona                     Yuma County   5513.997
## 29 California                   Fresno County   5958.869
## 30 California                     Inyo County  10180.938
## 31 California                     Kern County   8131.951
## 32 California                Riverside County   7206.435
## 33 California           San Bernardino County  20057.108
## 34 California                 Siskiyou County   6277.888
## 35      Idaho                    Idaho County   8477.342
## 36      Idaho                   Owyhee County   7665.540
## 37      Maine                Aroostook County   6671.091
## 38  Minnesota                St. Louis County   6247.568
## 39    Montana               Beaverhead County   5541.607
## 40    Montana                 Flathead County   5087.670
## 41    Montana                 Phillips County   5140.086
## 42    Montana                  Rosebud County   5010.315
## 43   Nebraska                   Cherry County   5960.555
## 44     Nevada                    Clark County   7891.493
## 45     Nevada                     Elko County  17169.822
## 46     Nevada                 Humboldt County   9640.776
## 47     Nevada                   Lander County   5490.092
## 48     Nevada                  Lincoln County  10633.200
## 49     Nevada                      Nye County  18181.934
## 50     Nevada                 Pershing County   6036.599
## 51     Nevada                   Washoe County   6302.369
## 52     Nevada               White Pine County   8875.650
## 53 New Mexico                   Catron County   6923.674
## 54 New Mexico                   Chaves County   6065.277
## 55 New Mexico                 McKinley County   5449.820
## 56 New Mexico                    Otero County   6613.211
## 57 New Mexico               Rio Arriba County   5860.839
## 58 New Mexico                 San Juan County   5513.091
## 59 New Mexico                  Socorro County   6646.581
## 60     Oregon                  Douglas County   5036.459
## 61     Oregon                   Harney County  10133.215
## 62     Oregon                  Klamath County   5941.048
## 63     Oregon                     Lake County   8138.978
## 64     Oregon                  Malheur County   9887.555
## 65      Texas                 Brewster County   6183.776
## 66       Utah                Box Elder County   5745.552
## 67       Utah                 Garfield County   5175.096
## 68       Utah                  Millard County   6595.899
## 69       Utah                 San Juan County   7820.107
## 70       Utah                   Tooele County   6941.140
## 71 Washington                 Okanogan County   5267.998
## 72    Wyoming                   Carbon County   7897.550
## 73    Wyoming                  Fremont County   9183.843
## 74    Wyoming                  Natrona County   5340.512
## 75    Wyoming                     Park County   6942.034
## 76    Wyoming               Sweetwater County  10426.637

# Total 76

10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)

# Default quantile() call: minimum, the three quartiles and maximum.
popByQuartiles <- USCounties_df[["POPESTIMATE2013"]]
popByQuartiles %>% quantile()

##          0%         25%         50%         75%        100% 
##       90.00    11015.75    25733.00    67582.25 10017068.00

# The wanted quantiles can be requested explicitly through probs:
quantile(USCounties_df$POPESTIMATE2013, probs = c(1, 2, 3) / 4)

##      25%      50%      75% 
## 11015.75 25733.00 67582.25

# Store the three quartile boundaries that are used to group the counties.
firstQ <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.25)
med <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.50)
thirdQ <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.75)

# Topmost quartile: counties whose estimate lies above the third quartile.
USCounties_df[
  USCounties_df$POPESTIMATE2013 > thirdQ,
  c("CTYNAME", "POPESTIMATE2013")
] %>% head()

## Source: local data frame [6 x 2]
## 
##          CTYNAME POPESTIMATE2013
## 1 Baldwin County          195540
## 2 Calhoun County          116736
## 3 Cullman County           80811
## 4  DeKalb County           71013
## 5  Elmore County           80902
## 6  Etowah County          103931

# Counties strictly between the median (second quartile) and the third quartile
head(with(
  USCounties_df,
  CTYNAME[(POPESTIMATE2013 < thirdQ) & (POPESTIMATE2013 > med)]
))

## [1] "Autauga County"  "Barbour County"  "Blount County"   "Chambers County"
## [5] "Cherokee County" "Chilton County"

11. I would like to find some small towns to live in, where the population is around 10000

# Counties whose "CENSUS2010" value lies strictly between 10000 and 10100.
# NOTE(review): the window only covers values just above 10000; widen it if
# "around 10000" should also include slightly smaller counties.
USCounties_df %>%
  subset(
    CENSUS2010 > 10000 & CENSUS2010 < 10100,
    select = c(STNAME, CTYNAME, CENSUS2010)
  )

## Source: local data frame [7 x 3]
## 
##     STNAME           CTYNAME CENSUS2010
## 1 Colorado       Yuma County      10043
## 2  Georgia     Lanier County      10078
## 3   Kansas       Rice County      10083
## 4 Kentucky   Metcalfe County      10099
## 5  Montana     Carbon County      10078
## 6   Nevada White Pine County      10030
## 7 Virginia Cumberland County      10052

Homework #1

Andrew Zhang

Monday, October 20, 2014

1. Download the dataset and read the CSV file: HW1_US_Counties.csv

2. Rename Column-headers with clearer Names.

3. Sort All US Counties by Decreasing Area and print it out

4. Save the sorted data frame in two formats: as a csv file, & as .Rda

5. Create a new column to calculate the population density of each County.

6. How many counties are there in the US?

7. Store just the names of all US counties in a new vector.

8. Which counties have the most and the least people living in them? (Use column “POPESTIMATE2013”)

9. List of Counties with area greater than 5000 Sq Miles

10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)

11. I would like to find some small towns to live in, where the population is around 10000