This homework creates a SQLite database called olympics.db using two CSV files, then uses SQL queries to answer the assigned questions.
library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
library(RSQLite)
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Download the datasets and set working directory
athlete_events <- read_csv("athlete_events.csv")
## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Name, Sex, Team, NOC, Games, Season, City, Sport, Event, Medal
## dbl (5): ID, Age, Height, Weight, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
host_city_locations <- read_csv("host_city_locations.csv")
## Rows: 52 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Season, City
## dbl (3): Year, lon, lat
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
con <- dbConnect(SQLite(), "olympics.db")
Write both tables into the database:
#dbWriteTable(con, "athlete_events", athlete_events, overwrite = TRUE)
#dbWriteTable(con, "host_city_locations", host_city_locations, overwrite = TRUE)
Or copy_to as in class activity
copy_to(con, athlete_events, "athlete_events", overwrite = TRUE)
copy_to(con, host_city_locations, "host_city_locations", overwrite = TRUE)
Question 1: How many records are in athlete_events?
SELECT COUNT(*) AS total_records
FROM athlete_events;
| total_records |
|---|
| 271116 |
Question 2: How many different athletes competed in Olympics held in London?
SELECT COUNT(DISTINCT ID) AS different_athletes
FROM athlete_events
WHERE City = 'London';
| different_athletes |
|---|
| 16941 |
Question 3:Which Olympics host city is furthest north?
Method 1: Query and sub-query:
SELECT City, lat
FROM host_city_locations
WHERE CAST(lat AS REAL) = (
SELECT MAX(CAST(lat AS REAL))
FROM host_city_locations
);
| City | lat |
|---|---|
| Lillehammer | 61.11527 |
Method 2: two separate queries
SELECT MAX(CAST(lat AS REAL)) AS max_latitude
FROM host_city_locations;
| max_latitude |
|---|
| 61.11527 |
SELECT City, lat
FROM host_city_locations
ORDER BY CAST(lat AS REAL) DESC
LIMIT 1;
| City | lat |
|---|---|
| Lillehammer | 61.11527 |
Question 4: What is the list of host cities with no duplicates?
SELECT DISTINCT City
FROM host_city_locations
ORDER BY City;
| City |
|---|
| Albertville |
| Amsterdam |
| Antwerpen |
| Athina |
| Atlanta |
| Barcelona |
| Beijing |
| Berlin |
| Calgary |
| Chamonix |
q4 <- "
SELECT DISTINCT City
FROM host_city_locations
ORDER BY City;
"
answer4 <- dbGetQuery(con, q4)
write.csv(answer4, "host_cities_MA_in_R.csv", row.names = FALSE)