# Enter your name here: Gil Raitses
# 1. I did this homework by myself, with help from the book and the professor.
Last assignment we explored data visualization in R using the ggplot2 package. This homework continues to use ggplot, but this time, with maps. In addition, we will merge datasets using the built-in merge( ) function, which provides a similar capability to a JOIN in SQL (don’t worry if you do not know SQL). Many analytical strategies require joining data from different sources based on a “key” – a field that two datasets have in common.
Examine the resulting pop dataframe and add comments explaining what each column contains.
#install.packages("jsonlite")
#install.packages("dplyr")
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Location of the city-level dataset (JSON)
url <- "https://intro-datascience.s3.us-east-2.amazonaws.com/cities.json"
# Download and parse the JSON into the 'pop' dataframe
pop <- jsonlite::fromJSON(txt = url)
# Preview the first six rows
head(pop, n = 6L)
## city growth_from_2000_to_2013 latitude longitude population rank
## 1 New York 4.8% 40.71278 -74.00594 8405837 1
## 2 Los Angeles 4.8% 34.05223 -118.24368 3884307 2
## 3 Chicago -6.1% 41.87811 -87.62980 2718782 3
## 4 Houston 11.0% 29.76043 -95.36980 2195914 4
## 5 Philadelphia 2.6% 39.95258 -75.16522 1553165 5
## 6 Phoenix 14.0% 33.44838 -112.07404 1513367 6
## state
## 1 New York
## 2 California
## 3 Illinois
## 4 Texas
## 5 Pennsylvania
## 6 Arizona
# Inspect the structure of the data as loaded: column names, types, and sample values
str(pop)
## 'data.frame': 1000 obs. of 7 variables:
## $ city : chr "New York" "Los Angeles" "Chicago" "Houston" ...
## $ growth_from_2000_to_2013: chr "4.8%" "4.8%" "-6.1%" "11.0%" ...
## $ latitude : num 40.7 34.1 41.9 29.8 40 ...
## $ longitude : num -74 -118.2 -87.6 -95.4 -75.2 ...
## $ population : chr "8405837" "3884307" "2718782" "2195914" ...
## $ rank : chr "1" "2" "3" "4" ...
## $ state : chr "New York" "California" "Illinois" "Texas" ...
# Cleaning step: turn the text columns that hold numbers into numeric columns.
# The growth column is renamed first so its name records the unit once the
# "%" sign is stripped; rename() leaves the column in its original position.
pop <- pop %>%
  rename(growth_from_2000_to_2013_percent = growth_from_2000_to_2013) %>%
  mutate(
    # Remove any commas, then parse as a number
    population = as.numeric(gsub(",", "", population)),
    rank = as.numeric(rank),
    # Remove the "%" sign, then parse as a number
    growth_from_2000_to_2013_percent =
      as.numeric(gsub("%", "", growth_from_2000_to_2013_percent))
  )
# Show the first rows of the cleaned data
head(pop)
## city growth_from_2000_to_2013_percent latitude longitude population
## 1 New York 4.8 40.71278 -74.00594 8405837
## 2 Los Angeles 4.8 34.05223 -118.24368 3884307
## 3 Chicago -6.1 41.87811 -87.62980 2718782
## 4 Houston 11.0 29.76043 -95.36980 2195914
## 5 Philadelphia 2.6 39.95258 -75.16522 1553165
## 6 Phoenix 14.0 33.44838 -112.07404 1513367
## rank state
## 1 1 New York
## 2 2 California
## 3 3 Illinois
## 4 4 Texas
## 5 5 Pennsylvania
## 6 6 Arizona
# Re-inspect the structure to check the column types after the conversions above
str(pop)
## 'data.frame': 1000 obs. of 7 variables:
## $ city : chr "New York" "Los Angeles" "Chicago" "Houston" ...
## $ growth_from_2000_to_2013_percent: num 4.8 4.8 -6.1 11 2.6 14 21 10.5 5.6 10.5 ...
## $ latitude : num 40.7 34.1 41.9 29.8 40 ...
## $ longitude : num -74 -118.2 -87.6 -95.4 -75.2 ...
## $ population : num 8405837 3884307 2718782 2195914 1553165 ...
## $ rank : num 1 2 3 4 5 6 7 8 9 10 ...
## $ state : chr "New York" "California" "Illinois" "Texas" ...
# Summary of columns in the pop dataframe
# city: The name of the city.
# growth_from_2000_to_2013_percent: The percentage growth of the city's population from the year 2000 to 2013. The raw JSON stores this as a character string with a trailing "%"; after the cleaning step above it is numeric (e.g. 4.8).
# latitude: The latitude coordinate of the city (numeric value).
# longitude: The longitude coordinate of the city (numeric value).
# population: The population of the city. The raw JSON stores this as a character string; it was converted to numeric in the cleaning step above.
# rank: The rank of the city based on its population size (1 = largest). Stored as a character string in the raw JSON and converted to numeric in the cleaning step above.
# state: The name of the state where the city is located.
Hint: use str(pop) or glimpse(pop) to help understand the dataframe
# Checks the current data type of the 'population' column
str(pop$population)
## num [1:1000] 8405837 3884307 2718782 2195914 1553165 ...
# Convert 'population' to numeric only if it is not numeric already. The
# column was converted during the earlier cleaning step, so re-running
# gsub()/as.numeric() unconditionally would just round-trip the numbers
# through character. The guard keeps this section runnable on the raw
# (character) data as well.
if (!is.numeric(pop$population)) {
  # gsub() strips any commas so that as.numeric() can parse the strings
  pop$population <- as.numeric(gsub(",", "", pop$population, fixed = TRUE))
}
# Compute the average population; na.rm = TRUE skips any NA values
average_population <- mean(pop$population, na.rm = TRUE)
# Print the result
average_population
## [1] 131132.4
# In this dataset the raw 'population' values are digit-only strings (e.g. "8405837"), and the column was already converted to numeric in the cleaning step above (str() reports num). The gsub() call is therefore only a safeguard: it would strip commas used as thousands separators, if any were present, before as.numeric() parses the text. The na.rm = TRUE argument in mean() makes the average skip any NA values that a failed conversion might produce.
# Locate the least-populated city: which.min() gives the position of the
# first smallest population, and that row is pulled out of 'pop'
smallest_city <- pop[which.min(pop[["population"]]), , drop = FALSE]
# Pull the fields of interest out of that single row
smallest_population <- smallest_city[["population"]]
state_of_smallest_city <- smallest_city[["state"]]
smallest_city_name <- smallest_city[["city"]]
# Display the results
smallest_population
## [1] 36877
state_of_smallest_city
## [1] "Florida"
smallest_city_name
## [1] "Panama City"
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Location of the state-name / abbreviation lookup table (CSV)
url <- "https://intro-datascience.s3.us-east-2.amazonaws.com/statesInfo.csv"
# Parse the CSV into a tibble named 'abbr'
abbr <- readr::read_csv(file = url)
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abbreviation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the first few rows of the dataframe to check its column names
# before they are used as a merge key
head(abbr)
## # A tibble: 6 × 2
## State Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
# Ensuring the dplyr package is loaded
library(dplyr)
# Give the state-name column in 'abbr' the same lowercase name ('state')
# that 'pop' uses, so both dataframes share a key column name
abbr <- dplyr::rename(abbr, state = State)
# Show the first few rows to confirm the new column name
head(abbr)
## # A tibble: 6 × 2
## state Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
# Join each city row in 'pop' to its state's abbreviation through the shared
# 'state' column. all = FALSE is merge()'s default: only rows whose state
# appears in both dataframes are kept.
dfNew <- merge(x = pop, y = abbr, by = "state", all = FALSE)
# Preview the merged dataframe
head(dfNew)
## state city growth_from_2000_to_2013_percent latitude longitude
## 1 Alabama Auburn 26.4 32.60986 -85.48078
## 2 Alabama Florence 10.2 34.79981 -87.67725
## 3 Alabama Huntsville 16.3 34.73037 -86.58610
## 4 Alabama Dothan 16.6 31.22323 -85.39049
## 5 Alabama Birmingham -12.3 33.52066 -86.80249
## 6 Alabama Phenix City 31.9 32.47098 -85.00077
## population rank Abbreviation
## 1 58582 615 AL
## 2 40059 922 AL
## 3 186254 126 AL
## 4 68001 502 AL
## 5 212113 101 AL
## 6 37498 983 AL
# Summary of columns in the dfNew dataframe
# state: The name of the state where the city is located (character).
# city: The name of the city (character).
# growth_from_2000_to_2013_percent: The percentage growth of the city's population from the year 2000 to 2013. This is stored as a numeric value.
# latitude: The latitude coordinate of the city (numeric).
# longitude: The longitude coordinate of the city (numeric).
# population: The population of the city, stored as a numeric value.
# rank: The rank of the city based on its population size, stored as a numeric value.
# Abbreviation: The abbreviation of the state where the city is located (character).
# Installs and loads necessary packages
#install.packages("ggplot2")
#install.packages("maps")
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# State outlines from the 'maps' package, as a dataframe ggplot2 can draw
us_map <- map_data(map = "state")
# Two layers: state outlines underneath, one point per city on top
ggplot() +
  # Base layer: state polygons with a white fill and black borders
  geom_polygon(
    data = us_map,
    mapping = aes(x = long, y = lat, group = group),
    colour = "black",
    fill = "white"
  ) +
  # City layer: one point per row of dfNew, coloured by population
  geom_point(
    data = dfNew,
    mapping = aes(x = longitude, y = latitude, colour = population),
    size = 2
  ) +
  # Colour scale running from yellow (small) to red (large), plus labels
  scale_colour_gradient(low = "yellow", high = "red") +
  labs(
    title = "US Cities Population Map",
    x = "Longitude",
    y = "Latitude",
    colour = "Population"
  ) +
  theme_minimal()
# Critique of map:
# Color gradient lacks contrast.
# Overlapping points obscure cities.
# Projection doesn't fit US geography.
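# Points far outside the outline are probably not data errors: dfNew still contains Alaska cities (and presumably Hawaii), which the "state" map outline of the contiguous US does not draw.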
# Uniform point size hides population differences.
# Total the city populations within each state (one row per state).
# Note: these totals cover only the cities present in dfNew.
dfSimple <- dfNew %>%
  group_by(state) %>%
  summarise(
    total_population = sum(population, na.rm = TRUE),
    .groups = "drop"
  )
# Show the first rows of the per-state totals
head(dfSimple)
## # A tibble: 6 × 2
## state total_population
## <chr> <dbl>
## 1 Alabama 1279813
## 2 Alaska 300950
## 3 Arizona 4691466
## 4 Arkansas 787011
## 5 California 27910620
## 6 Colorado 3012284
# Row of dfSimple with the largest summed city population
most_populous_state <- dfSimple %>%
  slice(which.max(total_population))
# Row of dfSimple with the smallest summed city population
least_populous_state <- dfSimple %>%
  slice(which.min(total_population))
# Display both rows
most_populous_state
## # A tibble: 1 × 2
## state total_population
## <chr> <dbl>
## 1 California 27910620
least_populous_state
## # A tibble: 1 × 2
## state total_population
## <chr> <dbl>
## 1 Vermont 42284
# Install and load necessary packages
#install.packages("mapdata")
library(ggplot2)
library(maps)
library(mapdata)
# State outlines; the state name is held in the 'region' column
us_map <- map_data(map = "state")
# Lower-case the state names in dfSimple so they can be matched against
# the 'region' column in the merge below
dfSimple[["state"]] <- tolower(dfSimple[["state"]])
# Attach each state's total to every outline vertex; all.x = TRUE keeps
# outline rows even when a state has no total (those get NA)
map_df <- merge(
  x = us_map,
  y = dfSimple,
  by.x = "region",
  by.y = "state",
  all.x = TRUE
)
# merge() reorders rows, so restore the vertex drawing sequence ('order')
# that geom_polygon() needs to trace each outline correctly
map_df <- map_df[order(map_df[["order"]]), , drop = FALSE]
# Choropleth: each state filled according to its total population
ggplot(
  data = map_df,
  mapping = aes(x = long, y = lat, group = group, fill = total_population)
) +
  geom_polygon(colour = "peachpuff2") +
  scale_fill_continuous(
    name = "Population",
    low = "indianred",
    high = "steelblue"
  ) +
  labs(title = "US State Population Map") +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
  theme_minimal()
# coord_map() is used to set the map projection and adjust display parameters.
# "albers" is chosen as the projection because it is suitable for mapping large areas that are east-west in extent, like the United States.
# lat0 = 39 and lat1 = 45 are standard parallels that help to minimize distortion in the region of interest (the continental US).
# lat0 is roughly the latitude of southern Pennsylvania or northern Virginia.
# lat1 is approximately the latitude of northern Michigan or northern Oregon.
# By using these values, the Albers projection will better represent the area between these latitudes, reducing distortion in the most populated and geographically significant regions of the United States.