library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the census CSV into a tibble named `census`.
census <- read_csv(file = "census.csv")
## Rows: 306 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): state, postal_code, region, division
## dbl (3): year, population, land_area
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[COMPLETED] Open an R Markdown file [COMPLETED]
Print the first few lines of `census` to the console using the `head()` function.
# Preview the first rows of the raw data.
census %>% head()
## # A tibble: 6 × 7
## state year population land_area postal_code region division
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Georgia 2012 9901430 59425 GA South South Atlantic
## 2 West Virginia 2013 1853914 24230 WV South South Atlantic
## 3 Alaska 2010 713910 665384 AK West Pacific
## 4 Kansas 2013 2893212 82278 KS Midwest West North Centr…
## 5 Maryland 2010 5788645 12406 MD South South Atlantic
## 6 Colorado 2015 5450623 104094 CO West Mountain
# state: Categorical
# year: Quantitative
# population: Quantitative
# land_area: Quantitative
# postal_code: Categorical
# region: Categorical
# division: Categorical
# Note: I classified year as quantitative because it is stored as a number and differences between years are meaningful (e.g., 2015 - 2010 = 5 years). In this dataset, year records the point in time each row's census data refers to.
# Sort rows by region, then year, then population from largest to smallest.
census <- census %>%
  arrange(region, year, desc(population))
census %>% head()
## # A tibble: 6 × 7
## state year population land_area postal_code region division
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Illinois 2010 12840503 57914 IL Midwest East North Central
## 2 Ohio 2010 11539336 44826 OH Midwest East North Central
## 3 Michigan 2010 9877510 96714 MI Midwest East North Central
## 4 Indiana 2010 6490432 36420 IN Midwest East North Central
## 5 Missouri 2010 5995974 69707 MO Midwest West North Central
## 6 Wisconsin 2010 5690475 65496 WI Midwest East North Central
# Drop the postal_code column; all other columns are kept.
census <- census %>%
  select(-postal_code)
census %>% head()
## # A tibble: 6 × 6
## state year population land_area region division
## <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Illinois 2010 12840503 57914 Midwest East North Central
## 2 Ohio 2010 11539336 44826 Midwest East North Central
## 3 Michigan 2010 9877510 96714 Midwest East North Central
## 4 Indiana 2010 6490432 36420 Midwest East North Central
## 5 Missouri 2010 5995974 69707 Midwest West North Central
## 6 Wisconsin 2010 5690475 65496 Midwest East North Central
# Add density: population divided by land_area, computed per row.
census <- census %>%
  mutate(density = population / land_area)
census %>% head()
## # A tibble: 6 × 7
## state year population land_area region division density
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Illinois 2010 12840503 57914 Midwest East North Central 222.
## 2 Ohio 2010 11539336 44826 Midwest East North Central 257.
## 3 Michigan 2010 9877510 96714 Midwest East North Central 102.
## 4 Indiana 2010 6490432 36420 Midwest East North Central 178.
## 5 Missouri 2010 5995974 69707 Midwest West North Central 86.0
## 6 Wisconsin 2010 5690475 65496 Midwest East North Central 86.9
# Add a logical flag marking rows whose land_area exceeds 2,000,000.
# NOTE(review): the largest land_area in the printed output (Alaska, 665384)
# is far below this threshold, so is_large may be FALSE for every row --
# TODO confirm the intended cutoff.
census <- census %>%
  mutate(is_large = land_area > 2e6)
census %>% head()
## # A tibble: 6 × 8
## state year population land_area region division density is_large
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <lgl>
## 1 Illinois 2010 12840503 57914 Midwest East North Cent… 222. FALSE
## 2 Ohio 2010 11539336 44826 Midwest East North Cent… 257. FALSE
## 3 Michigan 2010 9877510 96714 Midwest East North Cent… 102. FALSE
## 4 Indiana 2010 6490432 36420 Midwest East North Cent… 178. FALSE
## 5 Missouri 2010 5995974 69707 Midwest West North Cent… 86.0 FALSE
## 6 Wisconsin 2010 5690475 65496 Midwest East North Cent… 86.9 FALSE
# Keep only the 2015 rows; this overwrites census, so later steps see 2015 only.
census <- census %>%
  filter(year == 2015)
census %>% head()
## # A tibble: 6 × 8
## state year population land_area region division density is_large
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <lgl>
## 1 Illinois 2015 12858913 57914 Midwest East North Cent… 222. FALSE
## 2 Ohio 2015 11617527 44826 Midwest East North Cent… 259. FALSE
## 3 Michigan 2015 9931715 96714 Midwest East North Cent… 103. FALSE
## 4 Indiana 2015 6608422 36420 Midwest East North Cent… 181. FALSE
## 5 Missouri 2015 6071732 69707 Midwest West North Cent… 87.1 FALSE
## 6 Wisconsin 2015 5760940 65496 Midwest East North Cent… 88.0 FALSE
# Total population per region: one output row per region.
sum_population <- summarize(
  group_by(census, region),
  total_population = sum(population)
)
sum_population %>% head()
## # A tibble: 4 × 2
## region total_population
## <chr> <dbl>
## 1 Midwest 67860583
## 2 Northeast 56034684
## 3 South 120997341
## 4 West 75742555
# Answer: The region that had the highest summed population in 2015 was the South.
# Mean density per division within the West region. This is the unweighted
# mean of the per-row density values, not total population / total land area.
census_west <- summarise(
  group_by(filter(census, region == "West"), division),
  mean_density = mean(density)
)
print(census_west)
## # A tibble: 2 × 2
## division mean_density
## <chr> <dbl>
## 1 Mountain 27.9
## 2 Pacific 102.