library(openintro)## Warning: package 'openintro' was built under R version 4.2.3
Week 1 Assignment - Loading Data into a Data Frame. We are often tasked with taking data in one form and transforming it for easier downstream analysis. We will spend several weeks in this course on tidying and transformation operations.
This directory contains various demographic data about the United States Senate and House of Representatives over time. It’s been used in the following FiveThirtyEight articles:
Congress Today Is Older Than It’s Ever Been, by Geoffrey Skelley (April 3, 2023): https://fivethirtyeight.com/features/aging-congress-boomers/
Dataset found here: https://github.com/fivethirtyeight/data/blob/master/congress-demographics/data_aging_congress.csv
library(dplyr)## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (readr)## Warning: package 'readr' was built under R version 4.2.3
# Raw CSV of the FiveThirtyEight congress-demographics dataset.
urlfile <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/congress-demographics/data_aging_congress.csv"
# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed.
mydata <- read_csv(urlfile)
## Rows: 29120 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): chamber, state_abbrev, bioname, bioguide_id, generation
## dbl (6): congress, party_code, cmltv_cong, cmltv_chamber, age_days, age_years
## date (2): start_date, birthday
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the imported tibble.
head(mydata, n = 6L)
## # A tibble: 6 × 13
## congress start_date chamber state_abbrev party_code bioname bioguide_id
## <dbl> <date> <chr> <chr> <dbl> <chr> <chr>
## 1 82 1951-01-03 House ND 200 AANDAHL, Fred… A000001
## 2 80 1947-01-03 House VA 100 ABBITT, Watki… A000002
## 3 81 1949-01-03 House VA 100 ABBITT, Watki… A000002
## 4 82 1951-01-03 House VA 100 ABBITT, Watki… A000002
## 5 83 1953-01-03 House VA 100 ABBITT, Watki… A000002
## 6 84 1955-01-03 House VA 100 ABBITT, Watki… A000002
## # ℹ 6 more variables: birthday <date>, cmltv_cong <dbl>, cmltv_chamber <dbl>,
## # age_days <dbl>, age_years <dbl>, generation <chr>
# Show the column types and a few sample values for each column.
utils::str(mydata)
## spc_tbl_ [29,120 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ congress : num [1:29120] 82 80 81 82 83 84 85 86 87 88 ...
## $ start_date : Date[1:29120], format: "1951-01-03" "1947-01-03" ...
## $ chamber : chr [1:29120] "House" "House" "House" "House" ...
## $ state_abbrev : chr [1:29120] "ND" "VA" "VA" "VA" ...
## $ party_code : num [1:29120] 200 100 100 100 100 100 100 100 100 100 ...
## $ bioname : chr [1:29120] "AANDAHL, Fred George" "ABBITT, Watkins Moorman" "ABBITT, Watkins Moorman" "ABBITT, Watkins Moorman" ...
## $ bioguide_id : chr [1:29120] "A000001" "A000002" "A000002" "A000002" ...
## $ birthday : Date[1:29120], format: "1897-04-09" "1908-05-21" ...
## $ cmltv_cong : num [1:29120] 1 1 2 3 4 5 6 7 8 9 ...
## $ cmltv_chamber: num [1:29120] 1 1 2 3 4 5 6 7 8 9 ...
## $ age_days : num [1:29120] 19626 14106 14837 15567 16298 ...
## $ age_years : num [1:29120] 53.7 38.6 40.6 42.6 44.6 ...
## $ generation : chr [1:29120] "Lost" "Greatest" "Greatest" "Greatest" ...
## - attr(*, "spec")=
## .. cols(
## .. congress = col_double(),
## .. start_date = col_date(format = ""),
## .. chamber = col_character(),
## .. state_abbrev = col_character(),
## .. party_code = col_double(),
## .. bioname = col_character(),
## .. bioguide_id = col_character(),
## .. birthday = col_date(format = ""),
## .. cmltv_cong = col_double(),
## .. cmltv_chamber = col_double(),
## .. age_days = col_double(),
## .. age_years = col_double(),
## .. generation = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
I will rename the columns to make them clearer for the purposes of this exercise.
# Show the original column names once more, before they are renamed.
utils::head(mydata, n = 6L)
## # A tibble: 6 × 13
## congress start_date chamber state_abbrev party_code bioname bioguide_id
## <dbl> <date> <chr> <chr> <dbl> <chr> <chr>
## 1 82 1951-01-03 House ND 200 AANDAHL, Fred… A000001
## 2 80 1947-01-03 House VA 100 ABBITT, Watki… A000002
## 3 81 1949-01-03 House VA 100 ABBITT, Watki… A000002
## 4 82 1951-01-03 House VA 100 ABBITT, Watki… A000002
## 5 83 1953-01-03 House VA 100 ABBITT, Watki… A000002
## 6 84 1955-01-03 House VA 100 ABBITT, Watki… A000002
## # ℹ 6 more variables: birthday <date>, cmltv_cong <dbl>, cmltv_chamber <dbl>,
## # age_days <dbl>, age_years <dbl>, generation <chr>
# Rename every column to a capitalised, more descriptive label.
# dplyr::rename() takes `new_name = old_name` pairs; the data are unchanged.
df <- mydata %>%
  dplyr::rename(
    Congress = congress,
    Start_date = start_date,
    Chamber = chamber,
    State = state_abbrev,
    Party_code = party_code,
    Name = bioname,
    ID = bioguide_id,
    Birthday = birthday,
    Number_congress = cmltv_cong,
    Number_chamber = cmltv_chamber,
    Age_days = age_days,
    Age_years = age_years,
    Generation = generation
  )
Looking at the congress statistics.
# Per-column summary statistics for the renamed data frame.
summary(object = df)
## Congress Start_date Chamber State
## Min. : 66.00 Min. :1919-03-04 Length:29120 Length:29120
## 1st Qu.: 79.00 1st Qu.:1945-01-03 Class :character Class :character
## Median : 92.00 Median :1971-01-03 Mode :character Mode :character
## Mean : 91.88 Mean :1970-10-18
## 3rd Qu.:105.00 3rd Qu.:1997-01-03
## Max. :118.00 Max. :2023-01-03
## Party_code Name ID Birthday
## Min. :100.0 Length:29120 Length:29120 Min. :1835-06-10
## 1st Qu.:100.0 Class :character Class :character 1st Qu.:1891-12-21
## Median :100.0 Mode :character Mode :character Median :1918-11-22
## Mean :146.7 Mean :1917-01-24
## 3rd Qu.:200.0 3rd Qu.:1943-05-16
## Max. :537.0 Max. :1997-01-17
## Number_congress Number_chamber Age_days Age_years
## Min. : 1.000 Min. : 1.000 Min. : 8644 Min. :23.67
## 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:16732 1st Qu.:45.81
## Median : 4.000 Median : 4.000 Median :19523 Median :53.45
## Mean : 5.414 Mean : 5.112 Mean :19626 Mean :53.73
## 3rd Qu.: 8.000 3rd Qu.: 7.000 3rd Qu.:22359 3rd Qu.:61.22
## Max. :30.000 Max. :30.000 Max. :35824 Max. :98.08
## Generation
## Length:29120
## Class :character
## Mode :character
##
##
##
# Auto-print the renamed tibble; only the first ten rows are shown.
df## # A tibble: 29,120 × 13
## Congress Start_date Chamber State Party_code Name ID Birthday
## <dbl> <date> <chr> <chr> <dbl> <chr> <chr> <date>
## 1 82 1951-01-03 House ND 200 AANDAHL, Fred … A000… 1897-04-09
## 2 80 1947-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 3 81 1949-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 4 82 1951-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 5 83 1953-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 6 84 1955-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 7 85 1957-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 8 86 1959-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 9 87 1961-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 10 88 1963-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## # ℹ 29,110 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## # Age_days <dbl>, Age_years <dbl>, Generation <chr>
Silent generation members in the Republican (party code 200) and Democratic (party code 100) parties.
# Republican (Party_code 200) members of the Silent generation.
# The filter must use `==`: `Generation < "Silent"` is a string-ordering
# comparison, which kept every generation that sorts before "Silent"
# (e.g. "Lost", "Greatest") instead of the Silent generation itself.
# Party_code is numeric, so it is compared with a number, not a string.
# NOTE(review): the printed output below was produced by the old `<` filter;
# re-knit the document to refresh it.
Silent_rep <- subset(df, Generation == "Silent" & Party_code == 200)
Silent_rep
## # A tibble: 10,742 × 13
## Congress Start_date Chamber State Party_code Name ID Birthday
## <dbl> <date> <chr> <chr> <dbl> <chr> <chr> <date>
## 1 82 1951-01-03 House ND 200 AANDAHL, Fred … A000… 1897-04-09
## 2 93 1973-01-03 House SD 200 ABDNOR, James A000… 1923-02-13
## 3 94 1975-01-03 House SD 200 ABDNOR, James A000… 1923-02-13
## 4 95 1977-01-03 House SD 200 ABDNOR, James A000… 1923-02-13
## 5 96 1979-01-03 House SD 200 ABDNOR, James A000… 1923-02-13
## 6 97 1981-01-03 Senate SD 200 ABDNOR, James A000… 1923-02-13
## 7 98 1983-01-03 Senate SD 200 ABDNOR, James A000… 1923-02-13
## 8 99 1985-01-03 Senate SD 200 ABDNOR, James A000… 1923-02-13
## 9 83 1953-01-03 Senate NE 200 ABEL, Hazel He… A000… 1888-07-10
## 10 88 1963-01-03 House OH 200 ABELE, Homer E. A000… 1916-11-21
## # ℹ 10,732 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## # Age_days <dbl>, Age_years <dbl>, Generation <chr>
# Per-column summary statistics for the Republican subset.
summary(object = Silent_rep)
## Congress Start_date Chamber State
## Min. : 66.00 Min. :1919-03-04 Length:10742 Length:10742
## 1st Qu.: 76.00 1st Qu.:1939-01-03 Class :character Class :character
## Median : 87.00 Median :1961-01-03 Mode :character Mode :character
## Mean : 89.99 Mean :1967-01-10
## 3rd Qu.:107.00 3rd Qu.:2001-01-03
## Max. :118.00 Max. :2023-01-03
## Party_code Name ID Birthday
## Min. :200 Length:10742 Length:10742 Min. :1836-05-07
## 1st Qu.:200 Class :character Class :character 1st Qu.:1883-12-16
## Median :200 Mode :character Mode :character Median :1907-07-07
## Mean :200 Mean :1913-06-15
## 3rd Qu.:200 3rd Qu.:1950-08-29
## Max. :200 Max. :1995-08-01
## Number_congress Number_chamber Age_days Age_years
## Min. : 1.000 Min. : 1.000 Min. : 8644 Min. :23.67
## 1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.:16926 1st Qu.:46.34
## Median : 4.000 Median : 4.000 Median :19562 Median :53.56
## Mean : 4.776 Mean : 4.526 Mean :19567 Mean :53.57
## 3rd Qu.: 7.000 3rd Qu.: 6.000 3rd Qu.:22151 3rd Qu.:60.64
## Max. :25.000 Max. :25.000 Max. :35824 Max. :98.08
## Generation
## Length:10742
## Class :character
## Mode :character
##
##
##
# Democratic (Party_code 100) members of the Silent generation.
# The filter must use `==`: `Generation < "Silent"` is a string-ordering
# comparison, which kept every generation that sorts before "Silent"
# (e.g. "Greatest") instead of the Silent generation itself.
# Party_code is numeric, so it is compared with a number, not a string.
# NOTE(review): the printed output below was produced by the old `<` filter;
# re-knit the document to refresh it.
Silent_demo <- subset(df, Generation == "Silent" & Party_code == 100)
Silent_demo
## # A tibble: 12,669 × 13
## Congress Start_date Chamber State Party_code Name ID Birthday
## <dbl> <date> <chr> <chr> <dbl> <chr> <chr> <date>
## 1 80 1947-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 2 81 1949-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 3 82 1951-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 4 83 1953-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 5 84 1955-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 6 85 1957-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 7 86 1959-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 8 87 1961-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 9 88 1963-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## 10 89 1965-01-03 House VA 100 ABBITT, Watkin… A000… 1908-05-21
## # ℹ 12,659 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## # Age_days <dbl>, Age_years <dbl>, Generation <chr>
This is such an intriguing dataset to explore and analyze. If we have the chance to keep working with the same data for other projects, I would like to use it for exploratory analysis and visualizations.