library(openintro)
## Warning: package 'openintro' was built under R version 4.2.3

Introduction

Week 1 Assignment - Loading Data into a Data Frame. We are often tasked with taking data in one form and transforming it for easier downstream analysis. We will spend several weeks in this course on tidying and transformation operations.

Information about the data

This directory contains various demographic data about the United States Senate and House of Representatives over time. It’s been used in the following FiveThirtyEight articles:

Congress Today Is Older Than It’s Ever Been, by Geoffrey Skelley (April 3, 2023): https://fivethirtyeight.com/features/aging-congress-boomers/

Dataset found here: https://github.com/fivethirtyeight/data/blob/master/congress-demographics/data_aging_congress.csv

Loading the data

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library (readr)
## Warning: package 'readr' was built under R version 4.2.3
# Raw-file URL of the FiveThirtyEight congress-demographics CSV.
urlfile <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/congress-demographics/data_aging_congress.csv"

# read_csv() downloads http(s) paths itself, so no explicit url() connection
# is needed; column types are guessed by readr.
mydata <- read_csv(urlfile)
## Rows: 29120 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): chamber, state_abbrev, bioname, bioguide_id, generation
## dbl  (6): congress, party_code, cmltv_cong, cmltv_chamber, age_days, age_years
## date (2): start_date, birthday
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the freshly loaded data.
head(mydata, n = 6L)
## # A tibble: 6 × 13
##   congress start_date chamber state_abbrev party_code bioname        bioguide_id
##      <dbl> <date>     <chr>   <chr>             <dbl> <chr>          <chr>      
## 1       82 1951-01-03 House   ND                  200 AANDAHL, Fred… A000001    
## 2       80 1947-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 3       81 1949-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 4       82 1951-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 5       83 1953-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 6       84 1955-01-03 House   VA                  100 ABBITT, Watki… A000002    
## # ℹ 6 more variables: birthday <date>, cmltv_cong <dbl>, cmltv_chamber <dbl>,
## #   age_days <dbl>, age_years <dbl>, generation <chr>
# Show the structure of the data: one line per column with its type and
# leading values, plus the attributes attached to the object.
utils::str(object = mydata)
## spc_tbl_ [29,120 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ congress     : num [1:29120] 82 80 81 82 83 84 85 86 87 88 ...
##  $ start_date   : Date[1:29120], format: "1951-01-03" "1947-01-03" ...
##  $ chamber      : chr [1:29120] "House" "House" "House" "House" ...
##  $ state_abbrev : chr [1:29120] "ND" "VA" "VA" "VA" ...
##  $ party_code   : num [1:29120] 200 100 100 100 100 100 100 100 100 100 ...
##  $ bioname      : chr [1:29120] "AANDAHL, Fred George" "ABBITT, Watkins Moorman" "ABBITT, Watkins Moorman" "ABBITT, Watkins Moorman" ...
##  $ bioguide_id  : chr [1:29120] "A000001" "A000002" "A000002" "A000002" ...
##  $ birthday     : Date[1:29120], format: "1897-04-09" "1908-05-21" ...
##  $ cmltv_cong   : num [1:29120] 1 1 2 3 4 5 6 7 8 9 ...
##  $ cmltv_chamber: num [1:29120] 1 1 2 3 4 5 6 7 8 9 ...
##  $ age_days     : num [1:29120] 19626 14106 14837 15567 16298 ...
##  $ age_years    : num [1:29120] 53.7 38.6 40.6 42.6 44.6 ...
##  $ generation   : chr [1:29120] "Lost" "Greatest" "Greatest" "Greatest" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   congress = col_double(),
##   ..   start_date = col_date(format = ""),
##   ..   chamber = col_character(),
##   ..   state_abbrev = col_character(),
##   ..   party_code = col_double(),
##   ..   bioname = col_character(),
##   ..   bioguide_id = col_character(),
##   ..   birthday = col_date(format = ""),
##   ..   cmltv_cong = col_double(),
##   ..   cmltv_chamber = col_double(),
##   ..   age_days = col_double(),
##   ..   age_years = col_double(),
##   ..   generation = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Renaming columns and preparing dataset

I will rename the columns so that they are easier to understand and better suited to the purposes of this exercise.

# Look at the original column names once more before renaming them.
mydata %>% head()
## # A tibble: 6 × 13
##   congress start_date chamber state_abbrev party_code bioname        bioguide_id
##      <dbl> <date>     <chr>   <chr>             <dbl> <chr>          <chr>      
## 1       82 1951-01-03 House   ND                  200 AANDAHL, Fred… A000001    
## 2       80 1947-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 3       81 1949-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 4       82 1951-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 5       83 1953-01-03 House   VA                  100 ABBITT, Watki… A000002    
## 6       84 1955-01-03 House   VA                  100 ABBITT, Watki… A000002    
## # ℹ 6 more variables: birthday <date>, cmltv_cong <dbl>, cmltv_chamber <dbl>,
## #   age_days <dbl>, age_years <dbl>, generation <chr>
# Give every column a capitalised, more descriptive name.
# Each argument reads new_name = old_name; the data itself is unchanged.
df <- dplyr::rename(
  mydata,
  Congress = congress,
  Start_date = start_date,
  Chamber = chamber,
  State = state_abbrev,
  Party_code = party_code,
  Name = bioname,
  ID = bioguide_id,
  Birthday = birthday,
  Number_congress = cmltv_cong,
  Number_chamber = cmltv_chamber,
  Age_days = age_days,
  Age_years = age_years,
  Generation = generation
)

Looking at the congress statistics.

# Per-column summary of the renamed data set.
df %>% summary()
##     Congress        Start_date           Chamber             State          
##  Min.   : 66.00   Min.   :1919-03-04   Length:29120       Length:29120      
##  1st Qu.: 79.00   1st Qu.:1945-01-03   Class :character   Class :character  
##  Median : 92.00   Median :1971-01-03   Mode  :character   Mode  :character  
##  Mean   : 91.88   Mean   :1970-10-18                                        
##  3rd Qu.:105.00   3rd Qu.:1997-01-03                                        
##  Max.   :118.00   Max.   :2023-01-03                                        
##    Party_code        Name                ID               Birthday         
##  Min.   :100.0   Length:29120       Length:29120       Min.   :1835-06-10  
##  1st Qu.:100.0   Class :character   Class :character   1st Qu.:1891-12-21  
##  Median :100.0   Mode  :character   Mode  :character   Median :1918-11-22  
##  Mean   :146.7                                         Mean   :1917-01-24  
##  3rd Qu.:200.0                                         3rd Qu.:1943-05-16  
##  Max.   :537.0                                         Max.   :1997-01-17  
##  Number_congress  Number_chamber      Age_days       Age_years    
##  Min.   : 1.000   Min.   : 1.000   Min.   : 8644   Min.   :23.67  
##  1st Qu.: 2.000   1st Qu.: 2.000   1st Qu.:16732   1st Qu.:45.81  
##  Median : 4.000   Median : 4.000   Median :19523   Median :53.45  
##  Mean   : 5.414   Mean   : 5.112   Mean   :19626   Mean   :53.73  
##  3rd Qu.: 8.000   3rd Qu.: 7.000   3rd Qu.:22359   3rd Qu.:61.22  
##  Max.   :30.000   Max.   :30.000   Max.   :35824   Max.   :98.08  
##   Generation       
##  Length:29120      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Print the renamed tibble to confirm the new column names.
df
## # A tibble: 29,120 × 13
##    Congress Start_date Chamber State Party_code Name            ID    Birthday  
##       <dbl> <date>     <chr>   <chr>      <dbl> <chr>           <chr> <date>    
##  1       82 1951-01-03 House   ND           200 AANDAHL, Fred … A000… 1897-04-09
##  2       80 1947-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  3       81 1949-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  4       82 1951-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  5       83 1953-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  6       84 1955-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  7       85 1957-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  8       86 1959-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  9       87 1961-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
## 10       88 1963-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
## # ℹ 29,110 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## #   Age_days <dbl>, Age_years <dbl>, Generation <chr>

Subsetting by Generation

Members of the Silent generation in the Republican and Democratic parties.

# Republican members (party code 200) who belong to the Silent generation.
# The generation test must be `==`: `<` on character values is an alphabetical
# comparison and would instead select other generations such as "Lost" and
# "Greatest". Party_code is numeric, so it is compared with a number.
# NOTE(review): any rendered output shown for Silent_rep was produced before
# this fix; re-knit the document to refresh it.
Silent_rep <- subset(df, Generation == "Silent" & Party_code == 200)
Silent_rep
## # A tibble: 10,742 × 13
##    Congress Start_date Chamber State Party_code Name            ID    Birthday  
##       <dbl> <date>     <chr>   <chr>      <dbl> <chr>           <chr> <date>    
##  1       82 1951-01-03 House   ND           200 AANDAHL, Fred … A000… 1897-04-09
##  2       93 1973-01-03 House   SD           200 ABDNOR, James   A000… 1923-02-13
##  3       94 1975-01-03 House   SD           200 ABDNOR, James   A000… 1923-02-13
##  4       95 1977-01-03 House   SD           200 ABDNOR, James   A000… 1923-02-13
##  5       96 1979-01-03 House   SD           200 ABDNOR, James   A000… 1923-02-13
##  6       97 1981-01-03 Senate  SD           200 ABDNOR, James   A000… 1923-02-13
##  7       98 1983-01-03 Senate  SD           200 ABDNOR, James   A000… 1923-02-13
##  8       99 1985-01-03 Senate  SD           200 ABDNOR, James   A000… 1923-02-13
##  9       83 1953-01-03 Senate  NE           200 ABEL, Hazel He… A000… 1888-07-10
## 10       88 1963-01-03 House   OH           200 ABELE, Homer E. A000… 1916-11-21
## # ℹ 10,732 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## #   Age_days <dbl>, Age_years <dbl>, Generation <chr>
# Per-column summary of the Silent_rep subset.
summary(object = Silent_rep)
##     Congress        Start_date           Chamber             State          
##  Min.   : 66.00   Min.   :1919-03-04   Length:10742       Length:10742      
##  1st Qu.: 76.00   1st Qu.:1939-01-03   Class :character   Class :character  
##  Median : 87.00   Median :1961-01-03   Mode  :character   Mode  :character  
##  Mean   : 89.99   Mean   :1967-01-10                                        
##  3rd Qu.:107.00   3rd Qu.:2001-01-03                                        
##  Max.   :118.00   Max.   :2023-01-03                                        
##    Party_code      Name                ID               Birthday         
##  Min.   :200   Length:10742       Length:10742       Min.   :1836-05-07  
##  1st Qu.:200   Class :character   Class :character   1st Qu.:1883-12-16  
##  Median :200   Mode  :character   Mode  :character   Median :1907-07-07  
##  Mean   :200                                         Mean   :1913-06-15  
##  3rd Qu.:200                                         3rd Qu.:1950-08-29  
##  Max.   :200                                         Max.   :1995-08-01  
##  Number_congress  Number_chamber      Age_days       Age_years    
##  Min.   : 1.000   Min.   : 1.000   Min.   : 8644   Min.   :23.67  
##  1st Qu.: 2.000   1st Qu.: 2.000   1st Qu.:16926   1st Qu.:46.34  
##  Median : 4.000   Median : 4.000   Median :19562   Median :53.56  
##  Mean   : 4.776   Mean   : 4.526   Mean   :19567   Mean   :53.57  
##  3rd Qu.: 7.000   3rd Qu.: 6.000   3rd Qu.:22151   3rd Qu.:60.64  
##  Max.   :25.000   Max.   :25.000   Max.   :35824   Max.   :98.08  
##   Generation       
##  Length:10742      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Democratic members (party code 100) who belong to the Silent generation.
# The generation test must be `==`: `<` on character values is an alphabetical
# comparison and would instead select other generations such as "Lost" and
# "Greatest". Party_code is numeric, so it is compared with a number.
# NOTE(review): any rendered output shown for Silent_demo was produced before
# this fix; re-knit the document to refresh it.
Silent_demo <- subset(df, Generation == "Silent" & Party_code == 100)
Silent_demo
## # A tibble: 12,669 × 13
##    Congress Start_date Chamber State Party_code Name            ID    Birthday  
##       <dbl> <date>     <chr>   <chr>      <dbl> <chr>           <chr> <date>    
##  1       80 1947-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  2       81 1949-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  3       82 1951-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  4       83 1953-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  5       84 1955-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  6       85 1957-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  7       86 1959-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  8       87 1961-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
##  9       88 1963-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
## 10       89 1965-01-03 House   VA           100 ABBITT, Watkin… A000… 1908-05-21
## # ℹ 12,659 more rows
## # ℹ 5 more variables: Number_congress <dbl>, Number_chamber <dbl>,
## #   Age_days <dbl>, Age_years <dbl>, Generation <chr>

Conclusion

This is such an intriguing dataset to explore and analyze. If we have the chance to keep working with the same data in other projects, I would like to use it for further exploratory analysis and visualizations.

LS0tDQp0aXRsZTogIkhvbWV3b3JrIDEiDQphdXRob3I6ICJMYXVyYSBQdWVibGEiDQpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCg0KYGBge3IgbG9hZC1wYWNrYWdlcywgbWVzc2FnZT1GQUxTRX0NCg0KbGlicmFyeShvcGVuaW50cm8pDQpgYGANCg0KIyMjIEludHJvZHVjdGlvbg0KDQpXZWVrIDEgQXNzaWdubWVudCAtIExvYWRpbmcgRGF0YSBpbnRvIGEgRGF0YSBGcmFtZS5XZSBhcmUgb2Z0ZW4gdGFza2VkIHdpdGggdGFraW5nIGRhdGEgaW4gb25lIGZvcm0gYW5kIHRyYW5zZm9ybWluZyBpdCBmb3IgZWFzaWVyIGRvd25zdHJlYW0gYW5hbHlzaXMuIFdlIHdpbGwNCnNwZW5kIHNldmVyYWwgd2Vla3MgaW4gdGhpcyBjb3Vyc2Ugb24gdGlkeWluZyBhbmQgdHJhbnNmb3JtYXRpb24gb3BlcmF0aW9ucy4gDQoNCiMjIyBJbmZvcm1hdGlvbiBhYm91dCB0aGUgZGF0YQ0KVGhpcyBkaXJlY3RvcnkgY29udGFpbnMgdmFyaW91cyBkZW1vZ3JhcGhpYyBkYXRhIGFib3V0IHRoZSBVbml0ZWQgU3RhdGVzIFNlbmF0ZSBhbmQgSG91c2Ugb2YgUmVwcmVzZW50YXRpdmVzIG92ZXIgdGltZS4gSXTigJlzIGJlZW4gdXNlZCBpbiB0aGUgZm9sbG93aW5nIEZpdmVUaGlydHlFaWdodCBhcnRpY2xlczoNCg0KQ29uZ3Jlc3MgVG9kYXkgSXMgT2xkZXIgVGhhbiBJdOKAmXMgRXZlciBCZWVuLCBieSBHZW9mZnJleSBTa2VsbGV5IChBcHJpbCAzLCAyMDIzKTogDQpodHRwczovL2ZpdmV0aGlydHllaWdodC5jb20vZmVhdHVyZXMvYWdpbmctY29uZ3Jlc3MtYm9vbWVycy8gDQoNCkRhdGFzZXQgZm91bmQgaGVyZToNCmh0dHBzOi8vZ2l0aHViLmNvbS9maXZldGhpcnR5ZWlnaHQvZGF0YS9ibG9iL21hc3Rlci9jb25ncmVzcy1kZW1vZ3JhcGhpY3MvZGF0YV9hZ2luZ19jb25ncmVzcy5jc3YNCg0KIyMjIExvYWRpbmcgdGhlIGRhdGENCg0KYGBge3J9DQoNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5IChyZWFkcikNCmBgYA0KDQoNCmBgYHtyfQ0KdXJsZmlsZT0iaHR0cHM6Ly9yYXcuZ2l0aHVidXNlcmNvbnRlbnQuY29tL2ZpdmV0aGlydHllaWdodC9kYXRhL21hc3Rlci9jb25ncmVzcy1kZW1vZ3JhcGhpY3MvZGF0YV9hZ2luZ19jb25ncmVzcy5jc3YiDQoNCm15ZGF0YTwtcmVhZF9jc3YodXJsKHVybGZpbGUpKQ0KYGBgDQoNCmBgYHtyfQ0KaGVhZChteWRhdGEpDQpgYGANCg0KYGBge3J9DQpzdHIobXlkYXRhKQ0KYGBgDQojIyMgUmVuYW1pbmcgY29sdW1ucyBhbmQgcHJlcGFyaW5nIGRhdGFzZXQNCkkgd2lsbCByZW5hbWUgY29sdW1ucyBmb3IgYSBtb3JlIGNsZWFyIHVuZGVyc3RhbmRpbmcgYW5kIGZvciB0aGUgcHVycG9zZXMgb2YgdGhpcyBleGVyY2lzZS4NCg0KYGBge3J9DQpoZWFkKG15ZGF0YSkNCmBgYA0KDQpgYGB7cn0NCiNSZW5hbWluZyBjb2x1bW5zDQpkZjwtbXlkYXRhICU+JQ0KICBkcGx5cjo6cmVuYW1lKCJDb25ncmVzcyIgPSAiY29uZ3Jlc3MiLA0KICAgICAgICAgIlN0YXJ0X2Rh
dGUiID0gInN0YXJ0X2RhdGUiLA0KICAgICAgICAgIkNoYW1iZXIiID0gImNoYW1iZXIiLA0KICAgICAgICAgIlN0YXRlIiA9ICJzdGF0ZV9hYmJyZXYiLA0KICAgICAgICAgIlBhcnR5X2NvZGUiID0gInBhcnR5X2NvZGUiLA0KICAgICAgICAgIk5hbWUiID0gImJpb25hbWUiLA0KICAgICAgICAgIklEIiA9ICJiaW9ndWlkZV9pZCIsDQogICAgICAgICAiQmlydGhkYXkiID0gImJpcnRoZGF5IiwNCiAgICAgICAgICJOdW1iZXJfY29uZ3Jlc3MiID0gImNtbHR2X2NvbmciLA0KICAgICAgICAgIk51bWJlcl9jaGFtYmVyIiA9ICJjbWx0dl9jaGFtYmVyIiwNCiAgICAgICAgICJBZ2VfZGF5cyIgPSAiYWdlX2RheXMiLA0KICAgICAgICAgIkFnZV95ZWFycyIgPSAiYWdlX3llYXJzIiwNCiAgICAgICAgICJHZW5lcmF0aW9uIiA9ICJnZW5lcmF0aW9uIikNCmBgYA0KDQoNCkxvb2tpbmcgYXQgdGhlIGNvbmdyZXNzIHN0YXRpc3RpY3MuDQpgYGB7cn0NCnN1bW1hcnkoZGYpDQpgYGANCmBgYHtyfQ0KZGYNCg0KYGBgDQoNCg0KIyMjIFN1YnNldHRpbmcgYnkgR2VuZXJhdGlvbg0KU2lsZW50IGdlbmVyYXRpb24gaW4gdGhlIERlbW9jcmF0IHBhcnR5Lg0KYGBge3J9DQpTaWxlbnRfcmVwPC0gc3Vic2V0KGRmLCBHZW5lcmF0aW9uIDwiU2lsZW50IiAmIFBhcnR5X2NvZGU9PSIyMDAiKQ0KU2lsZW50X3JlcA0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KFNpbGVudF9yZXApDQpgYGANCg0KYGBge3J9DQpTaWxlbnRfZGVtbzwtIHN1YnNldChkZiwgR2VuZXJhdGlvbiA8IlNpbGVudCIgJiBQYXJ0eV9jb2RlPT0iMTAwIikNClNpbGVudF9kZW1vDQpgYGANCiMjIyBDb25jbHVzaW9uDQpUaGlzIGlzIHN1Y2ggYW4gaW50cmlndWluZyBkYXRhc2V0IHRvIGV4cGxvcmUgYW5kIGFuYWx5emUuIElmIHdlIGhhdmUgY2hhbmNlIHRvIGtlZXAgd29ya2luZyB3aXRoIHRoaXMgdGhlIHNhbWUgZGF0YSBmb3Igb3RoZXJzIHByb2plY3QgSSB0aGluayBJIHdvdWxkIGxpa2UgdG8ga2VlcCB3b3JraW5nIHdpdGggdGhpcyBkYXRhIHNldCBmb3IgZXhwbG9yYXRvcnkgYW5hbHlzaXMgYW5kIHZpc3VhbGl6YXRpb25zLiANCg0KDQoNCg0KDQo=