1. Read the data

library(dplyr)
library(data.table)
library(ggplot2)
data <- fread('P60007168-ALL.csv', header = FALSE)
## 
Read 0.0% of 2063097 rows
Read 16.0% of 2063097 rows
Read 25.7% of 2063097 rows
Read 35.4% of 2063097 rows
Read 48.0% of 2063097 rows
Read 63.0% of 2063097 rows
Read 76.1% of 2063097 rows
Read 80.9% of 2063097 rows
Read 2063097 rows and 19 (of 19) columns from 0.413 GB file in 00:00:14
data <- data[, 1:18, with=FALSE]
nameStr <- 
"cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,
contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,
contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,tran_id,election_tp"

nameStr <- strsplit(nameStr, ',')[[1]]
newName <- gsub('\n', '', nameStr)
names(data) <- newName

# change the column type
new_data <- data %>%
  mutate(contbr_employer = factor(contbr_employer), 
         contbr_occupation = factor(contbr_occupation),
         contbr_st = factor(contbr_st),
         contb_receipt_dt = as.Date(contb_receipt_dt, format='%d-%b-%y'))

2. Most significant contributors by Occupation

q1 <- new_data %>% 
  group_by(contbr_occupation) %>%
  filter(contb_receipt_amt > 0) %>%
  summarise(total_amount = sum(contb_receipt_amt), total_contbr = n(), avg_donate=total_amount/total_contbr) %>%
  arrange(desc(total_contbr))
q1[1:10, ]
## # A tibble: 10 Ă— 4
##    contbr_occupation total_amount total_contbr avg_donate
##               <fctr>        <dbl>        <int>      <dbl>
## 1       NOT EMPLOYED   25628469.6       567712   45.14343
## 2            RETIRED    4052546.4        76700   52.83633
## 3            TEACHER    1823146.9        50043   36.43161
## 4           ENGINEER    1628144.6        29752   54.72387
## 5  SOFTWARE ENGINEER    1822173.0        29517   61.73300
## 6           ATTORNEY    2050315.7        29184   70.25479
## 7          PROFESSOR    1405766.2        26096   53.86903
## 8      SELF EMPLOYED    1213079.1        23479   51.66656
## 9          PHYSICIAN    1924241.7        23257   82.73817
## 10        CONSULTANT     900961.4        17543   51.35732
p1 <- ggplot(q1[1:10,], aes(x=reorder(contbr_occupation, total_contbr), y=total_contbr)) + 
  geom_bar(stat = 'identity') + 
  coord_flip() + 
  labs(title = 'Most Significant Contributors by Occupation (Top 10)', 
       y = 'Number of Contributors',
       x = 'Occupation')
p1

3. Most significant contributors by Employer

q2 <- new_data %>% 
  group_by(contbr_employer) %>%
  filter(contb_receipt_amt > 0) %>%
  summarise(total_amount = sum(contb_receipt_amt), total_contbr = n(), avg_donate=total_amount/total_contbr) %>%
  arrange(desc(total_contbr))
q2[1:10, ]
## # A tibble: 10 Ă— 4
##          contbr_employer total_amount total_contbr avg_donate
##                   <fctr>        <dbl>        <int>      <dbl>
## 1                   NONE   15713103.0       429918   36.54907
## 2           NOT EMPLOYED   14380900.1       223208   64.42825
## 3                   SELF    5937895.4       139408   42.59365
## 4          SELF EMPLOYED    6792103.7       122804   55.30849
## 5                RETIRED     671612.3        16960   39.59978
## 6          SELF-EMPLOYED     719114.3        15533   46.29590
## 7                           3099513.5        15277  202.88758
## 8  INFORMATION REQUESTED    1967702.3         9323  211.05891
## 9                    N/A     417217.1         7564   55.15827
## 10                   MS.     166044.5         4902   33.87280
p2 <- ggplot(q2[1:10,], aes(x=reorder(contbr_employer, total_contbr), y=total_contbr)) + 
  geom_bar(stat = 'identity') + 
  coord_flip() + 
  labs(title = 'Most Significant Contributors by Employer (Top 10)', 
       y = 'Number of Contributors',
       x = 'Employer')
p2

4. Most significant contributors over State

q3 <- new_data %>% 
  group_by(contbr_st) %>%
  filter(contb_receipt_amt > 0) %>%
  summarise(total_amount = sum(contb_receipt_amt), total_contbr = n(), avg_donate=total_amount/total_contbr) %>%
  arrange(desc(total_contbr))
q3[1:10, ]
## # A tibble: 10 Ă— 4
##    contbr_st total_amount total_contbr avg_donate
##       <fctr>        <dbl>        <int>      <dbl>
## 1         CA     20679154       404069   51.17728
## 2         NY      8457321       173387   48.77713
## 3         WA      5744186       120828   47.54019
## 4         MA      7882637       113686   69.33692
## 5         FL      3483095        81950   42.50268
## 6         TX      3568233        79487   44.89077
## 7         IL      3453610        76015   45.43327
## 8         OR      3120169        72475   43.05165
## 9         PA      2526101        59255   42.63101
## 10        VT      2537746        55706   45.55607
p3 <- ggplot(q3[1:10,], aes(x=reorder(contbr_st, total_contbr), y=total_contbr)) + 
  geom_bar(stat = 'identity') + 
  coord_flip() + 
  labs(title = 'Most Significant Contributors by State (Top 10)', 
       y = 'Number of Contributors',
       x = 'State')
p3

5. Most significant contributors over time

q4 <- new_data %>%
  group_by(Year_Month = format(contb_receipt_dt, '%y-%m')) %>%
  filter(contb_receipt_amt > 0) %>%
  summarise(total_amount = sum(contb_receipt_amt), total_contbr = n()) %>%
  arrange(Year_Month)

p4 <- ggplot(q4, aes(x=Year_Month, y=total_contbr)) + geom_line(group=1) +
  labs(title = 'Number of Contributors Over Time', 
       y = 'Number of Contributors',
       x = 'Year_Month')
p4