Load the tidyverse package (which attaches dplyr along with the other core packages)

library("tidyverse")
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library("dplyr", warn.conflicts = FALSE)

Part A (40 marks)

Load the data and show the data

Read the data directly from the website

Skip the first 8 rows (the 9th row is read in as the header and its names are replaced later)

Remove the last 7 rows that contain irrelevant information

locn <- 'https://www2.census.gov/programs-surveys/popest/tables/2010-2015/state/totals/' 
fname <- 'nst-est2015-01.csv' 
data <- paste(locn, fname, sep = "")                            # full URL of the census file
nest2 <- read.table(data, header = TRUE, sep = ",", skip = 8)   # row 9 becomes the header
nest1 <- tbl_df(nest2)                                          # wrap as a tibble for printing
nest <- head(nest1, -7)                                         # drop the 7 footnote rows at the end
nest
## # A tibble: 51 x 9
##                     West X71.945.553 X71.946.907 X72.119.326 X72.844.064
##                   <fctr>      <fctr>      <fctr>      <fctr>      <fctr>
##  1              .Alabama   4,779,736   4,780,127   4,785,161   4,801,108
##  2               .Alaska     710,231     710,249     714,021     722,720
##  3              .Arizona   6,392,017   6,392,307   6,408,208   6,468,732
##  4             .Arkansas   2,915,918   2,915,958   2,922,394   2,938,538
##  5           .California  37,253,956  37,254,503  37,334,079  37,700,034
##  6             .Colorado   5,029,196   5,029,324   5,048,254   5,119,480
##  7          .Connecticut   3,574,097   3,574,118   3,579,717   3,589,759
##  8             .Delaware     897,934     897,936     899,791     907,916
##  9 .District of Columbia     601,723     601,767     605,126     620,472
## 10              .Florida  18,801,310  18,804,623  18,849,890  19,105,533
## # ... with 41 more rows, and 4 more variables: X73.595.996 <fctr>,
## #   X74.354.836 <fctr>, X75.179.041 <fctr>, X76.044.679 <fctr>
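
The same read can be done in one step with readr's read_csv(), assuming (as is the case here) that the 51 state rows follow immediately after the header row exposed by skip = 8; a sketch, where nest_alt is just an illustrative name:

# readr alternative (sketch): row 9 becomes the header, and n_max = 51 stops
# before the 7 footnote rows, so no head(, -7) trimming is needed
nest_alt <- read_csv(data, skip = 8, n_max = 51)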

Name the columns and show the data

colnames(nest) <- c("State","census","estbase","est2010","est2011", "est2012", "est2013", "est2014","est2015")
nest
## # A tibble: 51 x 9
##                    State     census    estbase    est2010    est2011
##                   <fctr>     <fctr>     <fctr>     <fctr>     <fctr>
##  1              .Alabama  4,779,736  4,780,127  4,785,161  4,801,108
##  2               .Alaska    710,231    710,249    714,021    722,720
##  3              .Arizona  6,392,017  6,392,307  6,408,208  6,468,732
##  4             .Arkansas  2,915,918  2,915,958  2,922,394  2,938,538
##  5           .California 37,253,956 37,254,503 37,334,079 37,700,034
##  6             .Colorado  5,029,196  5,029,324  5,048,254  5,119,480
##  7          .Connecticut  3,574,097  3,574,118  3,579,717  3,589,759
##  8             .Delaware    897,934    897,936    899,791    907,916
##  9 .District of Columbia    601,723    601,767    605,126    620,472
## 10              .Florida 18,801,310 18,804,623 18,849,890 19,105,533
## # ... with 41 more rows, and 4 more variables: est2012 <fctr>,
## #   est2013 <fctr>, est2014 <fctr>, est2015 <fctr>

Save the data frame as a separately named csv file

write.csv(nest, file = "nstest201501.csv", row.names = FALSE)

Solve Problem 1: convert the non-numeric columns (every column except State) to numeric columns

Show the data

nest <- read_csv("nstest201501.csv")
## Parsed with column specification:
## cols(
##   State = col_character(),
##   census = col_number(),
##   estbase = col_number(),
##   est2010 = col_number(),
##   est2011 = col_number(),
##   est2012 = col_number(),
##   est2013 = col_number(),
##   est2014 = col_number(),
##   est2015 = col_number()
## )
nest <- tbl_df(nest)
nest
## # A tibble: 51 x 9
##                    State   census  estbase  est2010  est2011  est2012
##                    <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1              .Alabama  4779736  4780127  4785161  4801108  4816089
##  2               .Alaska   710231   710249   714021   722720   731228
##  3              .Arizona  6392017  6392307  6408208  6468732  6553262
##  4             .Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           .California 37253956 37254503 37334079 37700034 38056055
##  6             .Colorado  5029196  5029324  5048254  5119480  5191731
##  7          .Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             .Delaware   897934   897936   899791   907916   917099
##  9 .District of Columbia   601723   601767   605126   620472   635342
## 10              .Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <dbl>,
## #   est2014 <dbl>, est2015 <dbl>
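
The conversion here is done by readr's number parser (the col_number() entries in the specification above), which strips the grouping commas. The same parser can be applied directly to a comma-formatted character vector; a small illustration:

# parse_number() drops grouping marks such as the thousands commas
parse_number(c("4,779,736", "710,231"))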

Solve Problem 2: Remove the period before the name of every state

Show the data

# fixed = TRUE treats '.' as a literal period rather than the regex wildcard
nest$State <- gsub('.', '', nest$State, fixed = TRUE) 
nest
## # A tibble: 51 x 9
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <dbl>,
## #   est2014 <dbl>, est2015 <dbl>
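
gsub() with fixed = TRUE removes every literal period in the string; that is safe here because the leading period is the only one in each name. An anchored pattern makes the intent explicit; a sketch, equivalent to the call above:

# removes at most one leading period from each state name
sub("^\\.", "", nest$State)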

Save the clean file as a csv file named clean.csv.

write.csv(nest, file = "clean.csv", row.names = FALSE)

Part B (30 marks)

Read areas.csv into a data frame and retain only two columns: State and Land sq miles

Show the data

areas <- read_csv("areas.csv")
## Warning: Duplicated column names deduplicated: 'State' => 'State_1' [4],
## 'State' => 'State_2' [6]
## Parsed with column specification:
## cols(
##   Rank = col_integer(),
##   State = col_character(),
##   `Total sq miles` = col_double(),
##   State_1 = col_character(),
##   `Land sq miles` = col_double(),
##   State_2 = col_character(),
##   `Water sq miles` = col_double()
## )
areas <- subset(areas, select = c("State", "Land sq miles"))
areas <- tbl_df(areas)
areas
## # A tibble: 50 x 2
##         State `Land sq miles`
##         <chr>           <dbl>
##  1     Alaska          570374
##  2      Texas          261914
##  3 California          155973
##  4    Montana          145556
##  5 New Mexico          121365
##  6    Arizona          113642
##  7     Nevada          109806
##  8   Colorado          103730
##  9     Oregon           97105
## 10    Wyoming           96003
## # ... with 40 more rows
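
The same two-column selection can be written with dplyr's select(); read_csv() already returns a tibble, so no tbl_df() step is needed. A sketch, where areas_alt is just an illustrative name:

# dplyr alternative (sketch): backticks handle the space in the column name
areas_alt <- read_csv("areas.csv") %>%
  select(State, `Land sq miles`)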

Read clean.csv into another data frame and show the data

clean <- read_csv("clean.csv")
## Parsed with column specification:
## cols(
##   State = col_character(),
##   census = col_integer(),
##   estbase = col_integer(),
##   est2010 = col_integer(),
##   est2011 = col_integer(),
##   est2012 = col_integer(),
##   est2013 = col_integer(),
##   est2014 = col_integer(),
##   est2015 = col_integer()
## )
clean <- tbl_df(clean)
clean
## # A tibble: 51 x 9
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <int>    <int>    <int>    <int>    <int>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <int>,
## #   est2014 <int>, est2015 <int>

Join the two data frames into a third data frame named joined1

Show the data

joined1 <- full_join(clean, areas, by = "State")
joined1
## # A tibble: 51 x 10
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <int>    <int>    <int>    <int>    <int>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 4 more variables: est2013 <int>,
## #   est2014 <int>, est2015 <int>, `Land sq miles` <dbl>
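
areas.csv covers only the 50 states, so one row of clean gets no area from the join (the source of the NA group that appears in the summary later). anti_join() lists the unmatched rows directly; a quick check, shown as a sketch:

# rows of clean with no matching State in areas (here, District of Columbia)
anti_join(clean, areas, by = "State")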

Create a new column in joined1 using the mutate command that computes population density (population per sq mile) based on the estimated 2015 population

Show the data

joined1 <- mutate(joined1, popden = est2015 / `Land sq miles`)
joined1
## # A tibble: 51 x 11
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <int>    <int>    <int>    <int>    <int>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 5 more variables: est2013 <int>,
## #   est2014 <int>, est2015 <int>, `Land sq miles` <dbl>, popden <dbl>
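
As a quick sanity check on the new column, the states can be ordered by density; a sketch:

joined1 %>%
  select(State, popden) %>%
  arrange(desc(popden))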

Read blueredstates.csv into another data frame.

Show the data

blueredstates <- read_csv("blueredstates.csv")
## Warning: Duplicated column names deduplicated: 'Sen.' => 'Sen._1' [4]
## Parsed with column specification:
## cols(
##   State = col_character(),
##   Gov. = col_character(),
##   Sen. = col_character(),
##   Sen._1 = col_character(),
##   Overall = col_character(),
##   MedianHouseholdIncome = col_double()
## )
blueredstates <- tbl_df(blueredstates)
blueredstates
## # A tibble: 50 x 6
##            State  Gov.  Sen. Sen._1 Overall MedianHouseholdIncome
##            <chr> <chr> <chr>  <chr>   <chr>                 <dbl>
##  1      Maryland     R     D      D       D                 73971
##  2    New Jersey     R     D      D       D                 71919
##  3        Alaska     I     R      R       R                 71583
##  4   Connecticut     D     D      D       D                 70048
##  5        Hawaii     D     D      D       D                 69592
##  6 Massachusetts     R     D      D       D                 69160
##  7 New Hampshire     D     D      R       D                 66532
##  8      Virginia     D     D      D       D                 64902
##  9    California     D     D      D       D                 61933
## 10     Minnesota     D     D      D       D                 61481
## # ... with 40 more rows

Join it to joined1 and call it joined2.

Show the data

joined2 <- full_join(joined1, blueredstates, by = "State")
joined2
## # A tibble: 51 x 16
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <int>    <int>    <int>    <int>    <int>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 10 more variables: est2013 <int>,
## #   est2014 <int>, est2015 <int>, `Land sq miles` <dbl>, popden <dbl>,
## #   Gov. <chr>, Sen. <chr>, Sen._1 <chr>, Overall <chr>,
## #   MedianHouseholdIncome <dbl>

Using “piping” with %>%, take joined2 as the input, pipe it into a group_by() on the variable “Overall” (which indicates whether a state is red or blue), and provide summary averages for the following variables: popden, MedianHouseholdIncome, and Land sq miles.

joined2 %>%
  select("popden", "MedianHouseholdIncome", "Land sq miles", "Overall") %>%
  group_by(Overall) %>%
  summarise_each(funs(mean))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## # A tibble: 3 x 4
##   Overall   popden MedianHouseholdIncome `Land sq miles`
##     <chr>    <dbl>                 <dbl>           <dbl>
## 1       D 317.0556              58843.95        51530.18
## 2       R 100.9923              50389.96        85807.86
## 3    <NA>       NA                    NA              NA
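
As the message notes, summarise_each() is deprecated; the same table is produced by summarise_all(), which is available in this dplyr version (later releases would use across()). A sketch:

joined2 %>%
  select(popden, MedianHouseholdIncome, `Land sq miles`, Overall) %>%
  group_by(Overall) %>%
  summarise_all(mean)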

Part C (30 marks)

Read clean.csv into another data frame

Show the data

clean1 <- read_csv("clean.csv")
## Parsed with column specification:
## cols(
##   State = col_character(),
##   census = col_integer(),
##   estbase = col_integer(),
##   est2010 = col_integer(),
##   est2011 = col_integer(),
##   est2012 = col_integer(),
##   est2013 = col_integer(),
##   est2014 = col_integer(),
##   est2015 = col_integer()
## )
clean1
## # A tibble: 51 x 9
##                   State   census  estbase  est2010  est2011  est2012
##                   <chr>    <int>    <int>    <int>    <int>    <int>
##  1              Alabama  4779736  4780127  4785161  4801108  4816089
##  2               Alaska   710231   710249   714021   722720   731228
##  3              Arizona  6392017  6392307  6408208  6468732  6553262
##  4             Arkansas  2915918  2915958  2922394  2938538  2949499
##  5           California 37253956 37254503 37334079 37700034 38056055
##  6             Colorado  5029196  5029324  5048254  5119480  5191731
##  7          Connecticut  3574097  3574118  3579717  3589759  3593541
##  8             Delaware   897934   897936   899791   907916   917099
##  9 District of Columbia   601723   601767   605126   620472   635342
## 10              Florida 18801310 18804623 18849890 19105533 19352021
## # ... with 41 more rows, and 3 more variables: est2013 <int>,
## #   est2014 <int>, est2015 <int>

Retain only the following variables: “est2010”, “est2011”, “est2012”, “est2013”, “est2014”, and “est2015”.

clean1 <- select(clean1, c("est2010", "est2011","est2012","est2013","est2014", "est2015"))
clean1
## # A tibble: 51 x 6
##     est2010  est2011  est2012  est2013  est2014  est2015
##       <int>    <int>    <int>    <int>    <int>    <int>
##  1  4785161  4801108  4816089  4830533  4846411  4858979
##  2   714021   722720   731228   737442   737046   738432
##  3  6408208  6468732  6553262  6630799  6728783  6828065
##  4  2922394  2938538  2949499  2957957  2966835  2978204
##  5 37334079 37700034 38056055 38414128 38792291 39144818
##  6  5048254  5119480  5191731  5271132  5355588  5456574
##  7  3579717  3589759  3593541  3597168  3594762  3590886
##  8   899791   907916   917099   925353   935968   945934
##  9   605126   620472   635342   649540   659836   672228
## 10 18849890 19105533 19352021 19594467 19905569 20271272
## # ... with 41 more rows

Provide a boxplot of the population across all states for each year. Year is on the x-axis and population is on the y-axis, as shown below. No labels are required; default labels are accepted.

boxplot(clean1)
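
Since ggplot2 is already attached, an equivalent boxplot can be drawn after reshaping the data to long form with tidyr's gather(); a sketch:

clean1 %>%
  gather(key = "year", value = "population") %>%
  ggplot(aes(x = year, y = population)) +
  geom_boxplot()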

Make the following graph

popmean <- summarise_each(clean1, funs(mean))
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
# numeric years on the x-axis, the row of column means on the y-axis;
# grid() is called after the plot exists rather than passed as an argument
plot(x = 2010:2015, y = unlist(popmean),
     main = "Mean of populations across states over the years",
     xlab = "Year", ylab = "Population",
     pch = 19, ylim = c(6000000, 6400000))
grid()