Import data

# excel file
data <- read_excel("../00_data/MyData.xlsx")
data
## # A tibble: 32,452 × 20
##     year state   state_po state_fips state_cen state_ic office   district stage
##    <dbl> <chr>   <chr>         <dbl>     <dbl>    <dbl> <chr>       <dbl> <chr>
##  1  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  2  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  3  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  4  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  5  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  6  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  7  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
##  8  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
##  9  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
## 10  1976 ALABAMA AL                1        63       41 US HOUSE        4 GEN  
## # ℹ 32,442 more rows
## # ℹ 11 more variables: runoff <lgl>, special <lgl>, candidate <chr>,
## #   party <chr>, writein <lgl>, mode <chr>, candidatevotes <dbl>,
## #   totalvotes <dbl>, unofficial <lgl>, version <dbl>, fusion_ticket <lgl>

Filter rows

filter(data, runoff == "true"  )
## # A tibble: 0 × 20
## # ℹ 20 variables: year <dbl>, state <chr>, state_po <chr>, state_fips <dbl>,
## #   state_cen <dbl>, state_ic <dbl>, office <chr>, district <dbl>, stage <chr>,
## #   runoff <lgl>, special <lgl>, candidate <chr>, party <chr>, writein <lgl>,
## #   mode <chr>, candidatevotes <dbl>, totalvotes <dbl>, unofficial <lgl>,
## #   version <dbl>, fusion_ticket <lgl>

Arrange rows

arrange(data, desc(year) )
## # A tibble: 32,452 × 20
##     year state   state_po state_fips state_cen state_ic office   district stage
##    <dbl> <chr>   <chr>         <dbl>     <dbl>    <dbl> <chr>       <dbl> <chr>
##  1  2022 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  2  2022 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  3  2022 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  4  2022 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  5  2022 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  6  2022 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  7  2022 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  8  2022 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
##  9  2022 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
## 10  2022 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
## # ℹ 32,442 more rows
## # ℹ 11 more variables: runoff <lgl>, special <lgl>, candidate <chr>,
## #   party <chr>, writein <lgl>, mode <chr>, candidatevotes <dbl>,
## #   totalvotes <dbl>, unofficial <lgl>, version <dbl>, fusion_ticket <lgl>

Select columns

select(data,year,state,district, party, candidatevotes) 
## # A tibble: 32,452 × 5
##     year state   district party       candidatevotes
##    <dbl> <chr>      <dbl> <chr>                <dbl>
##  1  1976 ALABAMA        1 DEMOCRAT             58906
##  2  1976 ALABAMA        1 REPUBLICAN           98257
##  3  1976 ALABAMA        1 <NA>                     7
##  4  1976 ALABAMA        2 DEMOCRAT             66288
##  5  1976 ALABAMA        2 REPUBLICAN           90069
##  6  1976 ALABAMA        2 <NA>                     5
##  7  1976 ALABAMA        3 DEMOCRAT            106935
##  8  1976 ALABAMA        3 PROHIBITION           1111
##  9  1976 ALABAMA        3 <NA>                     2
## 10  1976 ALABAMA        4 REPUBLICAN           34531
## # ℹ 32,442 more rows

Add columns

mutate(data,candidatevotes_thousands = candidatevotes / 1000)
## # A tibble: 32,452 × 21
##     year state   state_po state_fips state_cen state_ic office   district stage
##    <dbl> <chr>   <chr>         <dbl>     <dbl>    <dbl> <chr>       <dbl> <chr>
##  1  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  2  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  3  1976 ALABAMA AL                1        63       41 US HOUSE        1 GEN  
##  4  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  5  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  6  1976 ALABAMA AL                1        63       41 US HOUSE        2 GEN  
##  7  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
##  8  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
##  9  1976 ALABAMA AL                1        63       41 US HOUSE        3 GEN  
## 10  1976 ALABAMA AL                1        63       41 US HOUSE        4 GEN  
## # ℹ 32,442 more rows
## # ℹ 12 more variables: runoff <lgl>, special <lgl>, candidate <chr>,
## #   party <chr>, writein <lgl>, mode <chr>, candidatevotes <dbl>,
## #   totalvotes <dbl>, unofficial <lgl>, version <dbl>, fusion_ticket <lgl>,
## #   candidatevotes_thousands <dbl>

Summarize by groups

summarise(data, votes_avg = mean(totalvotes, na.rm = TRUE))
## # A tibble: 1 × 1
##   votes_avg
##       <dbl>
## 1   215165.
data %>%
    
    # Group by Airlines
    group_by(year) %>%
    
    # Calculate Average Departure Delay
    summarise(votes_avg = mean(totalvotes, na.rm = TRUE))
## # A tibble: 24 × 2
##     year votes_avg
##    <dbl>     <dbl>
##  1  1976   173045.
##  2  1978   129435.
##  3  1980   180327.
##  4  1982   149305.
##  5  1984   196323.
##  6  1986   140428.
##  7  1988   193617.
##  8  1990   146097.
##  9  1992   229095.
## 10  1994   168899.
## # ℹ 14 more rows