Homework #1

1. Download the dataset and read the CSV file: HW1_US_Counties.csv
2. Rename Column-headers with clearer Names.
3. Sort All US Counties by Decreasing Area and print it out
4. Save the sorted data frame in two formats: as a csv file, & as .Rda
5. Create a new column to calculate the population density of each County.
6. How many counties are there in the US?
7. Store just the names of all US counties in a new vector.
8. Which counties have the most and the least people living in them? (Use column “POPESTIMATE2013”)
9. List of Counties with area greater than 5000 Sq Miles
10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)
11. I would like to find some small towns to live in, where the population is around 10000

1. Download the dataset and read the CSV file: HW1_US_Counties.csv

# Some housekeeping jobs:

# Set working directory:

# 1) For Windows:
#setwd("C:/Users/Andrew/OneDrive/AGZ_Home/workspace_R")

# 2) For Mac:
#setwd("/Volumes/AGZ_Home/workspace_R")

# 3) For linux
#setwd("/mnt/AGZ_Home_vmwin10/workspace_R")
# Switch to the project workspace, then show its contents and confirm the path.
setwd(dir = "c:/AGZ1/GD_AGZ1117/AGZ_Home/workspace_R")
list.files(path = ".")
getwd()
#dirpath <- "C:/Users/Andrew/SkyDrive/workspace_R/data"
#dir(dirpath, full=TRUE)  # make sure the 

# Download HW1_US_Counties.csv and save it in the "data" folder of the working directory (the folder assigned to dirpath below).
# Read the file:


# Folder that holds the downloaded homework data.
dirpath <- "c:/AGZ1/GD_AGZ1117/AGZ_Home/workspace_R/data"

# List the folder (with full paths) to make sure the CSV file is there.
dir(dirpath, full.names = TRUE)

# Read the counties file from that same folder, keeping text columns as
# character vectors rather than factors.
HW1_US_Counties <- read.csv(
  file.path(dirpath, "HW1_US_Counties.csv"),
  stringsAsFactors = FALSE
)

# Verify the import: dimensions and column names.
dim(HW1_US_Counties)
names(HW1_US_Counties)

# some preparation
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

# Convert to a tibble so that printing is compact and readable.
# tibble::as_tibble() replaces the deprecated dplyr::tbl_df().
USCounties_df <- tibble::as_tibble(HW1_US_Counties)
USCounties_df
# HW1_US_Counties

# A few utilities to get to know the data:
head(USCounties_df)
tail(USCounties_df)
names(USCounties_df)
length(USCounties_df)  # for a data frame this is the number of columns
str(USCounties_df)
class(USCounties_df)
dim(USCounties_df)

# Random samples: 5 rows, then 25% of the rows drawn with replacement.
# The argument name is spelled out in full instead of relying on partial
# argument matching.
USCounties_df %>% sample_n(5)
USCounties_df %>% sample_frac(0.25, replace = TRUE)
glimpse(USCounties_df)

2. Rename Column-headers with clearer Names.

For example: 
Rename col #9 from "POPESTIMATE2013" into: "POPEST2013",

# 1. Base R: replace the matching entries of the names vector.
# Rename "CENSUS2010POP" to "CENSUS2010" and "NATURALINC2013" to "INC2013".
# Note: assigning to names() changes the data set itself.
names(USCounties_df) <- replace(
  names(USCounties_df),
  names(USCounties_df) == "CENSUS2010POP",
  "CENSUS2010"
)
names(USCounties_df) <- replace(
  names(USCounties_df),
  names(USCounties_df) == "NATURALINC2013",
  "INC2013"
)
# Several columns can also be renamed in one step by position:
# names(uscounties)[c(1,2,8)] <- c("State_Name", "County_Name", "Pop_2010")
head(USCounties_df)

## # A tibble: 6 × 16
##    STNAME        CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
##     <chr>          <chr>  <int>  <int>    <int> <int>  <int>      <int>
## 1 Alabama Autauga County     50      3        6     1      1      54571
## 2 Alabama Baldwin County     50      3        6     1      3     182265
## 3 Alabama Barbour County     50      3        6     1      5      27457
## 4 Alabama    Bibb County     50      3        6     1      7      22915
## 5 Alabama  Blount County     50      3        6     1      9      57322
## 6 Alabama Bullock County     50      3        6     1     11      10914
## # ... with 8 more variables: POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, ALAND_SQMI <dbl>, AWATER_SQMI <dbl>

# 2. plyr: rename() takes a named vector of the form c("old" = "new").
# Rename "POPESTIMATE2013" to "POPEST2013" and "INTERNATIONALMIG2013" to
# "INTER'LMIG2013".
# Note: the result is only printed; it is not assigned back, so the data set
# keeps its original column names.
# The call is namespace-qualified because dplyr::rename() expects
# new_name = old_name and would not accept this form.
plyr::rename(
  USCounties_df,
  c(
    "POPESTIMATE2013" = "POPEST2013",
    "INTERNATIONALMIG2013" = "INTER'LMIG2013"
  )
)

## # A tibble: 3,144 × 16
##     STNAME         CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
##      <chr>           <chr>  <int>  <int>    <int> <int>  <int>      <int>
## 1  Alabama  Autauga County     50      3        6     1      1      54571
## 2  Alabama  Baldwin County     50      3        6     1      3     182265
## 3  Alabama  Barbour County     50      3        6     1      5      27457
## 4  Alabama     Bibb County     50      3        6     1      7      22915
## 5  Alabama   Blount County     50      3        6     1      9      57322
## 6  Alabama  Bullock County     50      3        6     1     11      10914
## 7  Alabama   Butler County     50      3        6     1     13      20947
## 8  Alabama  Calhoun County     50      3        6     1     15     118572
## 9  Alabama Chambers County     50      3        6     1     17      34215
## 10 Alabama Cherokee County     50      3        6     1     19      25989
## # ... with 3,134 more rows, and 8 more variables: POPEST2013 <int>,
## #   BIRTHS2013 <int>, DEATHS2013 <int>, INC2013 <int>,
## #   `INTER'LMIG2013` <int>, USPS <chr>, ALAND_SQMI <dbl>,
## #   AWATER_SQMI <dbl>

# Current column names of the data set.
colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "ALAND_SQMI"          
## [16] "AWATER_SQMI"

# Name of the 15th column.
colnames(USCounties_df)[15]

## [1] "ALAND_SQMI"

## Tam's Solution: rename in one command through the names() function.
## The column is matched by its name ("ALAND_SQMI") rather than by position 15,
## so a change in column order cannot rename the wrong column.
names(USCounties_df)[names(USCounties_df) == "ALAND_SQMI"] <- "Area"
names(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"

3. Sort All US Counties by Decreasing Area and print it out

# Sort with arrange(); desc() reverses the order.
# First try: counties by name, Z to A.
USCounties_df %>% arrange(desc(CTYNAME))

## # A tibble: 3,144 × 16
##          STNAME                   CTYNAME SUMLEV REGION DIVISION STATE
##           <chr>                     <chr>  <int>  <int>    <int> <int>
## 1  South Dakota            Ziebach County     50      2        4    46
## 2         Texas             Zavala County     50      3        7    48
## 3         Texas             Zapata County     50      3        7    48
## 4       Arizona               Yuma County     50      4        8     4
## 5      Colorado               Yuma County     50      4        8     8
## 6        Alaska Yukon-Koyukuk Census Area     50      4        9     2
## 7    California               Yuba County     50      4        9     6
## 8         Texas              Young County     50      3        7    48
## 9         Maine               York County     50      1        1    23
## 10     Nebraska               York County     50      2        4    31
## # ... with 3,134 more rows, and 10 more variables: COUNTY <int>,
## #   CENSUS2010 <int>, POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, Area <dbl>, AWATER_SQMI <dbl>

# Counties from largest to smallest Area.
USCounties_df %>% arrange(desc(Area))

## # A tibble: 3,144 × 16
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
##         <chr>                           <chr>  <int>  <int>    <int> <int>
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## # ... with 3,134 more rows, and 10 more variables: COUNTY <int>,
## #   CENSUS2010 <int>, POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, Area <dbl>, AWATER_SQMI <dbl>

## Ram's way:
# Base R sorts a data frame by indexing its rows with order().
# The sorted result can be assigned to another data frame if it is needed later.

USCounties_df[order(USCounties_df[["Area"]], decreasing = TRUE), ]

## # A tibble: 3,144 × 16
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
##         <chr>                           <chr>  <int>  <int>    <int> <int>
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## # ... with 3,134 more rows, and 10 more variables: COUNTY <int>,
## #   CENSUS2010 <int>, POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, Area <dbl>, AWATER_SQMI <dbl>

# First 10 rows of the area-sorted data.
head(
  USCounties_df[order(USCounties_df[["Area"]], decreasing = TRUE), ],
  n = 10
)

## # A tibble: 10 × 16
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
##         <chr>                           <chr>  <int>  <int>    <int> <int>
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## # ... with 10 more variables: COUNTY <int>, CENSUS2010 <int>,
## #   POPESTIMATE2013 <int>, BIRTHS2013 <int>, DEATHS2013 <int>,
## #   INC2013 <int>, INTERNATIONALMIG2013 <int>, USPS <chr>, Area <dbl>,
## #   AWATER_SQMI <dbl>

# Or sort the original data frame HW1_US_Counties by "CTYNAME" (Z to A).
# The row order is computed from the same data frame that is being indexed.
head(HW1_US_Counties[order(HW1_US_Counties$CTYNAME, decreasing = TRUE), ], 10)

##            STNAME                   CTYNAME SUMLEV REGION DIVISION STATE
## 2429 South Dakota            Ziebach County     50      2        4    46
## 2778        Texas             Zavala County     50      3        7    48
## 2777        Texas             Zapata County     50      3        7    48
## 111       Arizona               Yuma County     50      4        8     4
## 308      Colorado               Yuma County     50      4        8     8
## 96         Alaska Yukon-Koyukuk Census Area     50      4        9     2
## 244    California               Yuba County     50      4        9     6
## 2776        Texas              Young County     50      3        7    48
## 1194        Maine               York County     50      1        1    23
## 1748     Nebraska               York County     50      2        4    31
##      COUNTY CENSUS2010POP POPESTIMATE2013 BIRTHS2013 DEATHS2013
## 2429    137          2801            2834         39         19
## 2778    507         11677           12156        204         90
## 2777    505         14018           14390        283         94
## 111      27        195751          201201       3147       1414
## 308     125         10043           10151        154         91
## 96      290          5588            5695        107         65
## 244     115         72155           73340       1282        471
## 2776    503         18550           18341        222        231
## 1194     31        197131          199431       1831       1739
## 1748    185         13665           13883        177        130
##      NATURALINC2013 INTERNATIONALMIG2013 USPS ALAND_SQMI AWATER_SQMI
## 2429             20                    2   SD   1961.386       9.348
## 2778            114                   12   TX   1297.406       4.328
## 2777            189                    3   TX    998.412      59.603
## 111            1733                  219   AZ   5513.997       5.087
## 308              63                   15   CO   2364.394       4.299
## 96               42                    0   AK 145505.165    2299.769
## 244             811                   91   CA    632.020      11.969
## 2776             -9                   -2   TX    914.469      16.389
## 1194             92                   59   ME    990.748     278.965
## 1748             47                    8   NE    572.510       3.319

# The tibble sorted by decreasing Area gives the same ordering:

with(USCounties_df, USCounties_df[order(Area, decreasing = TRUE), ])

## # A tibble: 3,144 × 16
##        STNAME                         CTYNAME SUMLEV REGION DIVISION STATE
##         <chr>                           <chr>  <int>  <int>    <int> <int>
## 1      Alaska       Yukon-Koyukuk Census Area     50      4        9     2
## 2      Alaska             North Slope Borough     50      4        9     2
## 3      Alaska              Bethel Census Area     50      4        9     2
## 4      Alaska        Northwest Arctic Borough     50      4        9     2
## 5      Alaska      Valdez-Cordova Census Area     50      4        9     2
## 6      Alaska Southeast Fairbanks Census Area     50      4        9     2
## 7      Alaska       Matanuska-Susitna Borough     50      4        9     2
## 8      Alaska      Lake and Peninsula Borough     50      4        9     2
## 9      Alaska                Nome Census Area     50      4        9     2
## 10 California           San Bernardino County     50      4        9     6
## # ... with 3,134 more rows, and 10 more variables: COUNTY <int>,
## #   CENSUS2010 <int>, POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, Area <dbl>, AWATER_SQMI <dbl>

4. Save the sorted data frame in two formats: as a csv file, & as .Rda

# Save the data sorted by decreasing Area, as the task asks. None of the
# sorted results above were assigned, so the sort is applied here.
# local() keeps the sorted copy out of the global environment: the global
# USCounties_df keeps its original row order, while the object stored in the
# .Rda file is still named USCounties_df.
local({
  USCounties_df <- USCounties_df[order(USCounties_df$Area, decreasing = TRUE), ]
  write.csv(USCounties_df, file = "USCounties_df_new.csv")
  save(USCounties_df, file = "USCounties_df_new.Rda")
})

5. Create a new column to calculate the population density of each County.

# 1. Base R approach: add the column by assignment (this changes the data set).
# Density = CENSUS2010 divided by Area.

USCounties_df[["POPDENSITY"]] <-
  USCounties_df[["CENSUS2010"]] / USCounties_df[["Area"]]
colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"          "POPDENSITY"

# First six rows, now including POPDENSITY.
head(USCounties_df, n = 6)

## # A tibble: 6 × 17
##    STNAME        CTYNAME SUMLEV REGION DIVISION STATE COUNTY CENSUS2010
##     <chr>          <chr>  <int>  <int>    <int> <int>  <int>      <int>
## 1 Alabama Autauga County     50      3        6     1      1      54571
## 2 Alabama Baldwin County     50      3        6     1      3     182265
## 3 Alabama Barbour County     50      3        6     1      5      27457
## 4 Alabama    Bibb County     50      3        6     1      7      22915
## 5 Alabama  Blount County     50      3        6     1      9      57322
## 6 Alabama Bullock County     50      3        6     1     11      10914
## # ... with 9 more variables: POPESTIMATE2013 <int>, BIRTHS2013 <int>,
## #   DEATHS2013 <int>, INC2013 <int>, INTERNATIONALMIG2013 <int>,
## #   USPS <chr>, Area <dbl>, AWATER_SQMI <dbl>, POPDENSITY <dbl>

# 2. mutate() approach: the same calculation as a derived column.
# select() only narrows the printed columns and could be left out.
mutate(
  select(USCounties_df, STNAME, CTYNAME, COUNTY, CENSUS2010, POPDENSITY, Area),
  POPDENSITY2 = CENSUS2010 / Area
)

## # A tibble: 3,144 × 7
##     STNAME         CTYNAME COUNTY CENSUS2010 POPDENSITY     Area
##      <chr>           <chr>  <int>      <int>      <dbl>    <dbl>
## 1  Alabama  Autauga County      1      54571   91.80283  594.437
## 2  Alabama  Baldwin County      3     182265  114.64361 1589.840
## 3  Alabama  Barbour County      5      27457   31.02921  884.876
## 4  Alabama     Bibb County      7      22915   36.80634  622.583
## 5  Alabama   Blount County      9      57322   88.89792  644.807
## 6  Alabama  Bullock County     11      10914   17.52394  622.805
## 7  Alabama   Butler County     13      20947   26.96478  776.828
## 8  Alabama  Calhoun County     15     118572  195.70664  605.866
## 9  Alabama Chambers County     17      34215   57.35662  596.531
## 10 Alabama Cherokee County     19      25989   46.93527  553.720
## # ... with 3,134 more rows, and 1 more variables: POPDENSITY2 <dbl>

# Note: unlike the base R assignment, the mutate() result was only printed,
# so POPDENSITY2 is not stored in the data set.

colnames(USCounties_df)

##  [1] "STNAME"               "CTYNAME"              "SUMLEV"              
##  [4] "REGION"               "DIVISION"             "STATE"               
##  [7] "COUNTY"               "CENSUS2010"           "POPESTIMATE2013"     
## [10] "BIRTHS2013"           "DEATHS2013"           "INC2013"             
## [13] "INTERNATIONALMIG2013" "USPS"                 "Area"                
## [16] "AWATER_SQMI"          "POPDENSITY"

6. How many counties are there in the US?

# Number of rows, then number of columns.
c(nrow(USCounties_df), ncol(USCounties_df))

## [1] 3144   17

# It's 3144 counties

7. Store just the names of all US counties in a new vector.

# Pull the county-name column out as a vector and peek at its first entries.
USCounties_df_name <- USCounties_df[["CTYNAME"]]
head(USCounties_df_name, n = 6)

## [1] "Autauga County" "Baldwin County" "Barbour County" "Bibb County"   
## [5] "Blount County"  "Bullock County"

8. Which counties have the most and the least people living in them? (Use column “POPESTIMATE2013”)

# Part One: Use the dplyr package:

## Smallest and largest value of "POPESTIMATE2013" in one summary row.
## summarise() replaces the deprecated summarise_each()/funs() pair. It is
## namespace-qualified so that the dplyr version is used even when plyr is
## attached after dplyr.
USCounties_df %>%
  dplyr::summarise(
    min = min(POPESTIMATE2013, na.rm = TRUE),
    max = max(POPESTIMATE2013, na.rm = TRUE)
  )

## # A tibble: 1 × 2
##     min      max
##   <int>    <int>
## 1    90 10017068

# 2015/01/13:

# View() opens the data viewer, which is only useful in an interactive
# session, so it is skipped when the script is run or knitted in batch.
if (interactive()) {
  View(USCounties_df)
}

# Result: max: 10017068; min: 90
# Look up the county with the maximum "POPESTIMATE2013" (Los Angeles County).
# The extreme value is computed rather than typed in by hand, so the lookup
# stays correct if the data changes; which() also skips NA comparisons.
USCounties_df$CTYNAME[
  which(
    USCounties_df$POPESTIMATE2013 ==
      max(USCounties_df$POPESTIMATE2013, na.rm = TRUE)
  )
]

## [1] "Los Angeles County"

# Same lookup for the minimum "POPESTIMATE2013" (Kalawao County).
USCounties_df$CTYNAME[
  which(
    USCounties_df$POPESTIMATE2013 ==
      min(USCounties_df$POPESTIMATE2013, na.rm = TRUE)
  )
]

## [1] "Kalawao County"

# Part 2: Ram's way (gives the name without showing the number):
maxrow <- which.max(USCounties_df[["POPESTIMATE2013"]])
USCounties_df[["CTYNAME"]][maxrow]

## [1] "Los Angeles County"

# In one line:
with(USCounties_df, CTYNAME[which.max(POPESTIMATE2013)])

## [1] "Los Angeles County"

with(USCounties_df, CTYNAME[which.min(POPESTIMATE2013)])

## [1] "Kalawao County"

9. List of Counties with area greater than 5000 Sq Miles

# Keep only the rows whose Area exceeds 5000, then list their names.
bigCounties <- USCounties_df[which(USCounties_df[["Area"]] > 5000), ]
bigCounties[["CTYNAME"]]

##  [1] "Aleutians East Borough"          "Bethel Census Area"             
##  [3] "Denali Borough"                  "Dillingham Census Area"         
##  [5] "Fairbanks North Star Borough"    "Hoonah-Angoon Census Area"      
##  [7] "Kenai Peninsula Borough"         "Kodiak Island Borough"          
##  [9] "Lake and Peninsula Borough"      "Matanuska-Susitna Borough"      
## [11] "Nome Census Area"                "North Slope Borough"            
## [13] "Northwest Arctic Borough"        "Southeast Fairbanks Census Area"
## [15] "Valdez-Cordova Census Area"      "Wade Hampton Census Area"       
## [17] "Yakutat City and Borough"        "Yukon-Koyukuk Census Area"      
## [19] "Apache County"                   "Cochise County"                 
## [21] "Coconino County"                 "Maricopa County"                
## [23] "Mohave County"                   "Navajo County"                  
## [25] "Pima County"                     "Pinal County"                   
## [27] "Yavapai County"                  "Yuma County"                    
## [29] "Fresno County"                   "Inyo County"                    
## [31] "Kern County"                     "Riverside County"               
## [33] "San Bernardino County"           "Siskiyou County"                
## [35] "Idaho County"                    "Owyhee County"                  
## [37] "Aroostook County"                "St. Louis County"               
## [39] "Beaverhead County"               "Flathead County"                
## [41] "Phillips County"                 "Rosebud County"                 
## [43] "Cherry County"                   "Clark County"                   
## [45] "Elko County"                     "Humboldt County"                
## [47] "Lander County"                   "Lincoln County"                 
## [49] "Nye County"                      "Pershing County"                
## [51] "Washoe County"                   "White Pine County"              
## [53] "Catron County"                   "Chaves County"                  
## [55] "McKinley County"                 "Otero County"                   
## [57] "Rio Arriba County"               "San Juan County"                
## [59] "Socorro County"                  "Douglas County"                 
## [61] "Harney County"                   "Klamath County"                 
## [63] "Lake County"                     "Malheur County"                 
## [65] "Brewster County"                 "Box Elder County"               
## [67] "Garfield County"                 "Millard County"                 
## [69] "San Juan County"                 "Tooele County"                  
## [71] "Okanogan County"                 "Carbon County"                  
## [73] "Fremont County"                  "Natrona County"                 
## [75] "Park County"                     "Sweetwater County"

# Same filter, keeping only the state, county and area columns.
subset(USCounties_df, Area > 5000, select = c(STNAME, CTYNAME, Area))

## # A tibble: 76 × 3
##    STNAME                      CTYNAME      Area
##     <chr>                        <chr>     <dbl>
## 1  Alaska       Aleutians East Borough  6981.867
## 2  Alaska           Bethel Census Area 40570.001
## 3  Alaska               Denali Borough 12751.719
## 4  Alaska       Dillingham Census Area 18568.839
## 5  Alaska Fairbanks North Star Borough  7338.518
## 6  Alaska    Hoonah-Angoon Census Area  7524.904
## 7  Alaska      Kenai Peninsula Borough 16075.352
## 8  Alaska        Kodiak Island Borough  6549.726
## 9  Alaska   Lake and Peninsula Borough 23652.048
## 10 Alaska    Matanuska-Susitna Borough 24608.141
## # ... with 66 more rows

# Total 76

10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)

# Breakdown of the 2013 estimates at 0%, 25%, 50%, 75% and 100%.
popByQuartiles <- USCounties_df[["POPESTIMATE2013"]]
quantile(popByQuartiles, probs = seq(0, 1, 0.25))

##          0%         25%         50%         75%        100% 
##       90.00    11015.75    25733.00    67582.25 10017068.00

# We can ask for just the quantiles we want:
quantile(USCounties_df[["POPESTIMATE2013"]], probs = (1:3) / 4)

##      25%      50%      75% 
## 11015.75 25733.00 67582.25

# These are the boundaries used to group the counties.
firstQ <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.25)
med <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.50)
thirdQ <- quantile(USCounties_df$POPESTIMATE2013, probs = 0.75)

# Top quartile: counties above the 75% boundary (first few rows only).
head(
  USCounties_df[
    USCounties_df[["POPESTIMATE2013"]] > thirdQ,
    c("CTYNAME", "POPESTIMATE2013")
  ],
  n = 6
)

## # A tibble: 6 × 2
##          CTYNAME POPESTIMATE2013
##            <chr>           <int>
## 1 Baldwin County          195540
## 2 Calhoun County          116736
## 3 Cullman County           80811
## 4  DeKalb County           71013
## 5  Elmore County           80902
## 6  Etowah County          103931

# Counties between the second and third quartile boundaries: above the median,
# up to and including the 75% boundary. The upper bound is inclusive because
# the top quartile is selected with a strict "> thirdQ"; a county exactly on
# the boundary would otherwise belong to no group.
head(USCounties_df$CTYNAME[
  (USCounties_df$POPESTIMATE2013 > med) &
    (USCounties_df$POPESTIMATE2013 <= thirdQ)
])

## [1] "Autauga County"  "Barbour County"  "Blount County"   "Chambers County"
## [5] "Cherokee County" "Chilton County"

11. I would like to find some small towns to live in, where the population is around 10000

# Counties whose CENSUS2010 value lies strictly between 10000 and 10100.
subset(
  USCounties_df,
  subset = 10000 < CENSUS2010 & CENSUS2010 < 10100,
  select = c(STNAME, CTYNAME, CENSUS2010)
)

## # A tibble: 7 × 3
##     STNAME           CTYNAME CENSUS2010
##      <chr>             <chr>      <int>
## 1 Colorado       Yuma County      10043
## 2  Georgia     Lanier County      10078
## 3   Kansas       Rice County      10083
## 4 Kentucky   Metcalfe County      10099
## 5  Montana     Carbon County      10078
## 6   Nevada White Pine County      10030
## 7 Virginia Cumberland County      10052

Homework #1

Andrew Zhang

06/06/2017: Added table of content as per: https://stackoverflow.com/questions/23957278/how-to-add-table-of-contents-in-rmarkdown

1. Download the dataset and read the CSV file: HW1_US_Counties.csv

2. Rename Column-headers with clearer Names.

3. Sort All US Counties by Decreasing Area and print it out

4. Save the sorted data frame in two formats: as a csv file, & as .Rda

5. Create a new column to calculate the population density of each County.

6. How many counties are there in the US?

7. Store just the names of all US counties in a new vector.

8. Which counties have the most and the least people living in them? (Use column “POPESTIMATE2013”)

9. List of Counties with area greater than 5000 Sq Miles

10. Population distribution by Quartiles (Which counties are in the bottom 25%, top 25% etc)

11. I would like to find some small towns to live in, where the population is around 10000