library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("dplyr", warn.conflicts = FALSE)
Part A (40 marks)
locn <- 'https://www2.census.gov/programs-surveys/popest/tables/2010-2015/state/totals/'
fname <- 'nst-est2015-01.csv'
data = paste(locn, fname, sep="")
nest2 <- read.table(data, header=T, sep=",",skip = 8)
nest1 <- tbl_df(nest2)
nest <- head(nest1, -7)
nest
## # A tibble: 51 x 9
## West X71.945.553 X71.946.907 X72.119.326 X72.844.064
## <fctr> <fctr> <fctr> <fctr> <fctr>
## 1 .Alabama 4,779,736 4,780,127 4,785,161 4,801,108
## 2 .Alaska 710,231 710,249 714,021 722,720
## 3 .Arizona 6,392,017 6,392,307 6,408,208 6,468,732
## 4 .Arkansas 2,915,918 2,915,958 2,922,394 2,938,538
## 5 .California 37,253,956 37,254,503 37,334,079 37,700,034
## 6 .Colorado 5,029,196 5,029,324 5,048,254 5,119,480
## 7 .Connecticut 3,574,097 3,574,118 3,579,717 3,589,759
## 8 .Delaware 897,934 897,936 899,791 907,916
## 9 .District of Columbia 601,723 601,767 605,126 620,472
## 10 .Florida 18,801,310 18,804,623 18,849,890 19,105,533
## # ... with 41 more rows, and 4 more variables: X73.595.996 <fctr>,
## # X74.354.836 <fctr>, X75.179.041 <fctr>, X76.044.679 <fctr>
colnames(nest) <- c("State","census","estbase","est2010","est2011", "est2012", "est2013", "est2014","est2015")
nest
## # A tibble: 51 x 9
## State census estbase est2010 est2011
## <fctr> <fctr> <fctr> <fctr> <fctr>
## 1 .Alabama 4,779,736 4,780,127 4,785,161 4,801,108
## 2 .Alaska 710,231 710,249 714,021 722,720
## 3 .Arizona 6,392,017 6,392,307 6,408,208 6,468,732
## 4 .Arkansas 2,915,918 2,915,958 2,922,394 2,938,538
## 5 .California 37,253,956 37,254,503 37,334,079 37,700,034
## 6 .Colorado 5,029,196 5,029,324 5,048,254 5,119,480
## 7 .Connecticut 3,574,097 3,574,118 3,579,717 3,589,759
## 8 .Delaware 897,934 897,936 899,791 907,916
## 9 .District of Columbia 601,723 601,767 605,126 620,472
## 10 .Florida 18,801,310 18,804,623 18,849,890 19,105,533
## # ... with 41 more rows, and 4 more variables: est2012 <fctr>,
## # est2013 <fctr>, est2014 <fctr>, est2015 <fctr>
write.csv(nest, file = "nstest201501.csv", row.names = FALSE)
nest <- read_csv("nstest201501.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## census = col_number(),
## estbase = col_number(),
## est2010 = col_number(),
## est2011 = col_number(),
## est2012 = col_number(),
## est2013 = col_number(),
## est2014 = col_number(),
## est2015 = col_number()
## )
nest <- tbl_df(nest)
nest
## # A tibble: 51 x 9
## State census estbase est2010 est2011 est2012
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 .Alabama 4779736 4780127 4785161 4801108 4816089
## 2 .Alaska 710231 710249 714021 722720 731228
## 3 .Arizona 6392017 6392307 6408208 6468732 6553262
## 4 .Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 .California 37253956 37254503 37334079 37700034 38056055
## 6 .Colorado 5029196 5029324 5048254 5119480 5191731
## 7 .Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 .Delaware 897934 897936 899791 907916 917099
## 9 .District of Columbia 601723 601767 605126 620472 635342
## 10 .Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <dbl>,
## # est2014 <dbl>, est2015 <dbl>
nest$State <- gsub('.', '', nest$State, fixed=TRUE)
nest
## # A tibble: 51 x 9
## State census estbase est2010 est2011 est2012
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <dbl>,
## # est2014 <dbl>, est2015 <dbl>
write.csv(nest, file = "clean.csv", row.names = FALSE)
Part B (30 marks)
areas <- read_csv("areas.csv")
## Warning: Duplicated column names deduplicated: 'State' => 'State_1' [4],
## 'State' => 'State_2' [6]
## Parsed with column specification:
## cols(
## Rank = col_integer(),
## State = col_character(),
## `Total sq miles` = col_double(),
## State_1 = col_character(),
## `Land sq miles` = col_double(),
## State_2 = col_character(),
## `Water sq miles` = col_double()
## )
areas <- subset(areas, select = c("State", "Land sq miles"))
areas <- tbl_df(areas)
areas
## # A tibble: 50 x 2
## State `Land sq miles`
## <chr> <dbl>
## 1 Alaska 570374
## 2 Texas 261914
## 3 California 155973
## 4 Montana 145556
## 5 New Mexico 121365
## 6 Arizona 113642
## 7 Nevada 109806
## 8 Colorado 103730
## 9 Oregon 97105
## 10 Wyoming 96003
## # ... with 40 more rows
clean <- read_csv("clean.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## census = col_integer(),
## estbase = col_integer(),
## est2010 = col_integer(),
## est2011 = col_integer(),
## est2012 = col_integer(),
## est2013 = col_integer(),
## est2014 = col_integer(),
## est2015 = col_integer()
## )
clean <- tbl_df(clean)
clean
## # A tibble: 51 x 9
## State census estbase est2010 est2011 est2012
## <chr> <int> <int> <int> <int> <int>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <int>,
## # est2014 <int>, est2015 <int>
joined1 <- full_join(clean, areas, by = "State")
joined1
## # A tibble: 51 x 10
## State census estbase est2010 est2011 est2012
## <chr> <int> <int> <int> <int> <int>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 4 more variables: est2013 <int>,
## # est2014 <int>, est2015 <int>, `Land sq miles` <dbl>
joined1 <- mutate(joined1, popden = joined1$est2015/joined1$`Land sq miles`)
joined1
## # A tibble: 51 x 11
## State census estbase est2010 est2011 est2012
## <chr> <int> <int> <int> <int> <int>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 5 more variables: est2013 <int>,
## # est2014 <int>, est2015 <int>, `Land sq miles` <dbl>, popden <dbl>
blueredstates <- read_csv("blueredstates.csv")
## Warning: Duplicated column names deduplicated: 'Sen.' => 'Sen._1' [4]
## Parsed with column specification:
## cols(
## State = col_character(),
## Gov. = col_character(),
## Sen. = col_character(),
## Sen._1 = col_character(),
## Overall = col_character(),
## MedianHouseholdIncome = col_double()
## )
blueredstates <- tbl_df(blueredstates)
blueredstates
## # A tibble: 50 x 6
## State Gov. Sen. Sen._1 Overall MedianHouseholdIncome
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Maryland R D D D 73971
## 2 New Jersey R D D D 71919
## 3 Alaska I R R R 71583
## 4 Connecticut D D D D 70048
## 5 Hawaii D D D D 69592
## 6 Massachusetts R D D D 69160
## 7 New Hampshire D D R D 66532
## 8 Virginia D D D D 64902
## 9 California D D D D 61933
## 10 Minnesota D D D D 61481
## # ... with 40 more rows
joined2 <- full_join(joined1,blueredstates, by = "State")
joined2
## # A tibble: 51 x 16
## State census estbase est2010 est2011 est2012
## <chr> <int> <int> <int> <int> <int>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 10 more variables: est2013 <int>,
## # est2014 <int>, est2015 <int>, `Land sq miles` <dbl>, popden <dbl>,
## # Gov. <chr>, Sen. <chr>, Sen._1 <chr>, Overall <chr>,
## # MedianHouseholdIncome <dbl>
joined2 %>% select("popden","MedianHouseholdIncome","Land sq miles", "Overall") %>% group_by(Overall) %>% summarise_each(funs(mean))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## # A tibble: 3 x 4
## Overall popden MedianHouseholdIncome `Land sq miles`
## <chr> <dbl> <dbl> <dbl>
## 1 D 317.0556 58843.95 51530.18
## 2 R 100.9923 50389.96 85807.86
## 3 <NA> NA NA NA
Part C (30 marks) ## Read clean.csv into another data frame ## Show the data
clean1 <- read_csv("clean.csv")
## Parsed with column specification:
## cols(
## State = col_character(),
## census = col_integer(),
## estbase = col_integer(),
## est2010 = col_integer(),
## est2011 = col_integer(),
## est2012 = col_integer(),
## est2013 = col_integer(),
## est2014 = col_integer(),
## est2015 = col_integer()
## )
clean1
## # A tibble: 51 x 9
## State census estbase est2010 est2011 est2012
## <chr> <int> <int> <int> <int> <int>
## 1 Alabama 4779736 4780127 4785161 4801108 4816089
## 2 Alaska 710231 710249 714021 722720 731228
## 3 Arizona 6392017 6392307 6408208 6468732 6553262
## 4 Arkansas 2915918 2915958 2922394 2938538 2949499
## 5 California 37253956 37254503 37334079 37700034 38056055
## 6 Colorado 5029196 5029324 5048254 5119480 5191731
## 7 Connecticut 3574097 3574118 3579717 3589759 3593541
## 8 Delaware 897934 897936 899791 907916 917099
## 9 District of Columbia 601723 601767 605126 620472 635342
## 10 Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <int>,
## # est2014 <int>, est2015 <int>
clean1 <- select(clean1, c("est2010", "est2011","est2012","est2013","est2014", "est2015"))
clean1
## # A tibble: 51 x 6
## est2010 est2011 est2012 est2013 est2014 est2015
## <int> <int> <int> <int> <int> <int>
## 1 4785161 4801108 4816089 4830533 4846411 4858979
## 2 714021 722720 731228 737442 737046 738432
## 3 6408208 6468732 6553262 6630799 6728783 6828065
## 4 2922394 2938538 2949499 2957957 2966835 2978204
## 5 37334079 37700034 38056055 38414128 38792291 39144818
## 6 5048254 5119480 5191731 5271132 5355588 5456574
## 7 3579717 3589759 3593541 3597168 3594762 3590886
## 8 899791 907916 917099 925353 935968 945934
## 9 605126 620472 635342 649540 659836 672228
## 10 18849890 19105533 19352021 19594467 19905569 20271272
## # ... with 41 more rows
boxplot(clean1)
popmean <- summarise_each(clean1, funs(mean))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
plot(popmean,main="Mean of populations across states over the years",
xlab="Year", ylab="Population",grid(),pch = 19,
x=c("2010", "2011", "2012", "2013", "2014", "2015"), ylim = c(6000000, 6400000))