In this assignment, a dataset and a subset of it are explored. The data chosen was the 2022 FiveThirtyEight Election Forecast datasets (Senate only) from fivethirtyeight.com (link: https://github.com/fivethirtyeight/data/tree/master/election-forecasts-2022).
The first part of the assignment is to read the data from its raw source for reproducibility. Here the data's raw file was extracted from a GitHub repository. In the original dataset, the column named cycle was renamed to 2022_cycle, and a subset of the data was taken where the mean number of Senate seats held by Democrats was more than 50. This is the threshold of control over the Senate.
library(RCurl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the national Senate topline forecasts straight from the published CSV
# so the analysis can be re-run without a local copy of the file.
df <- read.csv(
  file = "https://projects.fivethirtyeight.com/2022-general-election-forecast-data/senate_national_toplines_2022.csv"
)
The first few rows of the dataset are shown below:
# Preview the first six rows of the raw data.
head(df, n = 6L)
## cycle branch expression forecastdate chamber_Dparty chamber_Rparty
## 1 2022 Senate _lite 9/4/22 0.809275 0.190725
## 2 2022 Senate _lite 9/3/22 0.809350 0.190650
## 3 2022 Senate _lite 9/2/22 0.808150 0.191850
## 4 2022 Senate _lite 9/1/22 0.816675 0.183325
## 5 2022 Senate _lite 8/31/22 0.820700 0.179300
## 6 2022 Senate _lite 8/30/22 0.816200 0.183800
## mean_seats_Dparty mean_seats_Rparty median_seats_Dparty median_seats_Rparty
## 1 52.21727 47.78273 52 48
## 2 52.23695 47.76305 52 48
## 3 52.23188 47.76812 52 48
## 4 52.27820 47.72180 52 48
## 5 52.33775 47.66225 53 47
## 6 52.19893 47.80107 52 48
## p90_seats_Dparty p90_seats_Rparty p10_seats_Dparty p10_seats_Rparty
## 1 56 52 48 44
## 2 56 52 48 44
## 3 56 52 48 44
## 4 56 52 48 44
## 5 56 52 48 44
## 6 56 52 48 44
## total_national_turnout p90_total_national_turnout p10_total_national_turnout
## 1 75200000 82100000 68100000
## 2 75200000 82300000 68100000
## 3 75200000 82200000 68200000
## 4 75200000 82300000 68200000
## 5 75200000 82300000 68000000
## 6 75200000 82200000 68100000
## popvote_margin p90_popvote_margin p10_popvote_margin simulations
## 1 3.650455 9.698673 -2.447037 40000
## 2 3.641415 9.693645 -2.420063 40000
## 3 3.632961 9.744553 -2.511457 40000
## 4 3.818428 10.020300 -2.280624 40000
## 5 3.968330 10.019780 -2.129215 40000
## 6 3.608364 9.662073 -2.554626 40000
## timestamp
## 1 11:54:52 4 Sep 2022
## 2 20:12:01 3 Sep 2022
## 3 20:12:01 2 Sep 2022
## 4 21:03:10 1 Sep 2022
## 5 20:33:38 31 Aug 2022
## 6 21:45:07 30 Aug 2022
# Rename the `cycle` column to `2022_cycle`; every other column name is left
# untouched, and nothing changes if no column is called `cycle`.
names(df) <- replace(names(df), names(df) == "cycle", "2022_cycle")
# Print the column names to confirm the rename.
colnames(df)
## [1] "2022_cycle" "branch"
## [3] "expression" "forecastdate"
## [5] "chamber_Dparty" "chamber_Rparty"
## [7] "mean_seats_Dparty" "mean_seats_Rparty"
## [9] "median_seats_Dparty" "median_seats_Rparty"
## [11] "p90_seats_Dparty" "p90_seats_Rparty"
## [13] "p10_seats_Dparty" "p10_seats_Rparty"
## [15] "total_national_turnout" "p90_total_national_turnout"
## [17] "p10_total_national_turnout" "popvote_margin"
## [19] "p90_popvote_margin" "p10_popvote_margin"
## [21] "simulations" "timestamp"
Question for Subset Data: On what date does the mean number of Senate seats projected for the Democratic Party first exceed 50? Why is this data interesting?
# Drop the columns that are not needed for the seat-count question.
df_col_exclude <- df %>%
  select(
    -branch, -expression, -chamber_Dparty, -chamber_Rparty,
    -p10_seats_Dparty, -p10_seats_Rparty
  )

# Keep only the forecasts in which the mean number of Democratic seats is
# above 50.
D_party_control_senate <- subset(df_col_exclude, mean_seats_Dparty > 50)

# forecastdate holds m/d/yy text (e.g. "9/4/22"). Sorting that text directly
# orders it alphabetically, so "10/1/22" would come before "6/24/22". Parse it
# to a Date for the sort key so the earliest qualifying forecast is listed
# first; the column itself is left unchanged.
head(
  D_party_control_senate %>%
    arrange(as.Date(forecastdate, format = "%m/%d/%y"))
)
## 2022_cycle forecastdate mean_seats_Dparty mean_seats_Rparty
## 1 2022 6/24/22 50.11635 49.88365
## 2 2022 6/25/22 50.08985 49.91015
## 3 2022 6/26/22 50.11153 49.88847
## 4 2022 6/29/22 50.39485 49.60515
## 5 2022 6/29/22 50.05315 49.94685
## 6 2022 6/30/22 50.09678 49.90322
## median_seats_Dparty median_seats_Rparty p90_seats_Dparty p90_seats_Rparty
## 1 50 50 55 54
## 2 50 50 55 55
## 3 50 50 55 54
## 4 51 49 55 54
## 5 50 50 54 54
## 6 50 50 55 55
## total_national_turnout p90_total_national_turnout p10_total_national_turnout
## 1 75800000 82900000 68800000
## 2 75800000 82900000 68900000
## 3 75800000 82900000 68800000
## 4 76200000 83300000 69200000
## 5 76200000 83300000 69200000
## 6 76200000 83200000 69300000
## popvote_margin p90_popvote_margin p10_popvote_margin simulations
## 1 -0.0498199 6.879044 -6.969931 40000
## 2 -0.0489426 6.788134 -7.004936 40000
## 3 -0.0482864 6.766439 -6.823490 40000
## 4 -0.0943642 6.716415 -6.976042 40000
## 5 0.5278893 7.104412 -6.092260 40000
## 6 -0.4509048 6.391006 -7.191931 40000
## timestamp
## 1 11:37:32 29 Jun 2022
## 2 12:03:45 29 Jun 2022
## 3 11:19:59 29 Jun 2022
## 4 22:11:20 29 Jun 2022
## 5 22:11:20 29 Jun 2022
## 6 20:51:20 30 Jun 2022