The NY Times has a recent article on federal Research and Development spending by agency.Data comes directly from the American Association for the Advancement of Science Historical Trends. In this Assignment we will break down the spending by political parties in the US for the periods supplied to establish patterns in Democrat and Republican Party spending for future Governments. Predictive Analytics will be run through the data and represented in Graphical Format.
##Democrats and Republic Split in spending
## # A tibble: 2 x 2
## party_in_power gcc_spending
## <chr> <dbl>
## 1 Democrat 22773.
## 2 Republican 20882.
## # Bootstrap sampling
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [126/43]> Bootstrap01
## 2 <split [126/46]> Bootstrap02
## 3 <split [126/43]> Bootstrap03
## 4 <split [126/47]> Bootstrap04
## 5 <split [126/41]> Bootstrap05
## 6 <split [126/41]> Bootstrap06
## 7 <split [126/41]> Bootstrap07
## 8 <split [126/44]> Bootstrap08
## 9 <split [126/45]> Bootstrap09
## 10 <split [126/48]> Bootstrap10
## # … with 15 more rows
## ══ Workflow ══════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ──────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ─────────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 1000
##
## Computational engine: ranger
climate_res %>%
collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.903 25 0.0117
## 2 roc_auc binary 0.980 25 0.00494
## Truth
## Prediction Democrat Republican
## Democrat 512 55
## Republican 56 515
climate_res %>%
collect_predictions() %>%
group_by(id) %>%
ppv(party_in_power, .pred_class) %>%
ggplot(aes(.estimate)) +
geom_histogram(bins=10) +
ggtitle("Climate Spend Data Model Histogram")
##set importance
## # A tibble: 1,138 x 10
## id .pred_Democrat .pred_Republican .row .pred_class party_in_power
## <chr> <dbl> <dbl> <int> <fct> <fct>
## 1 Boot… 0.541 0.459 1 Democrat Democrat
## 2 Boot… 0.163 0.837 2 Republican Republican
## 3 Boot… 0.0548 0.945 3 Republican Republican
## 4 Boot… 0.0458 0.954 6 Republican Republican
## 5 Boot… 0.0520 0.948 22 Republican Republican
## 6 Boot… 0.0517 0.948 23 Republican Republican
## 7 Boot… 0.0551 0.945 24 Republican Republican
## 8 Boot… 0.124 0.876 25 Republican Republican
## 9 Boot… 0.213 0.787 27 Republican Republican
## 10 Boot… 0.857 0.143 29 Democrat Democrat
## # … with 1,128 more rows, and 4 more variables: correct <lgl>,
## # department <fct>, year <dbl>, gcc_spending <dbl>
Data Frame, Recipe and Predictive Model for Climate Spend
## # Bootstrap sampling
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [231/83]> Bootstrap01
## 2 <split [231/88]> Bootstrap02
## 3 <split [231/78]> Bootstrap03
## 4 <split [231/82]> Bootstrap04
## 5 <split [231/84]> Bootstrap05
## 6 <split [231/83]> Bootstrap06
## 7 <split [231/84]> Bootstrap07
## 8 <split [231/86]> Bootstrap08
## 9 <split [231/80]> Bootstrap09
## 10 <split [231/76]> Bootstrap10
## # … with 15 more rows
## ══ Workflow ══════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ──────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ─────────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 1000
##
## Computational engine: ranger
energy_res %>%
collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.968 25 0.00479
## 2 roc_auc binary 0.999 25 0.000361
energy_res %>%
collect_predictions() %>%
conf_mat(party_in_power1,.pred_class)
## Truth
## Prediction Democrat Republican
## Democrat 1325 55
## Republican 13 744
energy_res %>%
collect_predictions() %>%
group_by(id) %>%
ppv(party_in_power1, .pred_class) %>%
ggplot(aes(.estimate)) +
geom_histogram(bins=10) +
ggtitle("Energy Spend Data Model Histogram")
##set importance
energy_pred<- energy_res %>%
collect_predictions() %>%
mutate(correct = party_in_power1 == .pred_class) %>%
#count(correct)
left_join(energy_df %>%
mutate(.row = row_number()))
energy_pred
## # A tibble: 2,137 x 10
## id .pred_Democrat .pred_Republican .row .pred_class party_in_power1
## <chr> <dbl> <dbl> <int> <fct> <fct>
## 1 Boot… 0.851 0.149 5 Democrat Democrat
## 2 Boot… 0.503 0.497 6 Democrat Republican
## 3 Boot… 0.229 0.771 8 Republican Republican
## 4 Boot… 0.228 0.772 9 Republican Republican
## 5 Boot… 0.225 0.775 11 Republican Republican
## 6 Boot… 0.864 0.136 13 Democrat Democrat
## 7 Boot… 0.970 0.0304 17 Democrat Democrat
## 8 Boot… 0.768 0.232 22 Democrat Democrat
## 9 Boot… 0.648 0.352 24 Democrat Democrat
## 10 Boot… 0.618 0.382 26 Democrat Democrat
## # … with 2,127 more rows, and 4 more variables: correct <lgl>,
## # department <fct>, year <dbl>, energy_spending <dbl>
ggplot() +
geom_line(data = energy_spend,
aes(year, energy_spending, color = department)) +
stat_summary(data = energy_pred,
aes(year, energy_spending, z = as.integer(correct)),
fun = "mean",
alpha = 0.7) +
ggtitle("Energy Spend Data Predictive Model by Energy Spend per Year")+
xlab("Year") + # for the x axis label
ylab("Spending Amount $millions (US)")
## # Bootstrap sampling
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [588/220]> Bootstrap01
## 2 <split [588/223]> Bootstrap02
## 3 <split [588/225]> Bootstrap03
## 4 <split [588/206]> Bootstrap04
## 5 <split [588/220]> Bootstrap05
## 6 <split [588/214]> Bootstrap06
## 7 <split [588/220]> Bootstrap07
## 8 <split [588/223]> Bootstrap08
## 9 <split [588/213]> Bootstrap09
## 10 <split [588/224]> Bootstrap10
## # … with 15 more rows
## ══ Workflow ══════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ──────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ─────────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 1000
##
## Computational engine: ranger
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 1.00 25 0.000181
## 2 roc_auc binary 1 25 0
## Truth
## Prediction Republican Democrat
## Republican 2677 0
## Democrat 1 2721
##set importance
## # A tibble: 7,743 x 12
## id .pred_Republican .pred_Democrat .row .pred_class party_in_power2
## <chr> <dbl> <dbl> <int> <fct> <fct>
## 1 Boot… 0.938 0.0617 3 Republican Republican
## 2 Boot… 0.956 0.0439 4 Republican Republican
## 3 Boot… 0.958 0.0424 6 Republican Republican
## 4 Boot… 0.911 0.0891 7 Republican Republican
## 5 Boot… 0.969 0.0313 10 Republican Republican
## 6 Boot… 0.824 0.176 12 Republican Republican
## 7 Boot… 0.672 0.328 13 Republican Republican
## 8 Boot… 0.885 0.115 14 Republican Republican
## 9 Boot… 0.824 0.176 15 Republican Republican
## 10 Boot… 0.879 0.121 17 Republican Republican
## # … with 7,733 more rows, and 6 more variables: correct <lgl>,
## # department <chr>, year <dbl>, rd_budget <dbl>, gdp <dbl>, rd_gdp <dbl>
ggplot() +
geom_line(data = fed_rd_df,
aes(year, rd_gdp * 100 , color = department)) +
stat_summary(data = fed_rd_pred,
aes(year, rd_gdp * 100 , z = as.integer(correct)),
fun = "mean",
alpha = 0.7) +
ggtitle("Data Frame Prediction of Fed RD Budget by %GDP")+
xlab("Year") + # for the x axis label
ylab("% of GDP")
```