# Show the directory R currently resolves relative paths against.
getwd()
## [1] "C:/Users/soumi/OneDrive/Documents"
# Point R at the folder named below so the CSV can be opened by bare file name.
setwd("~/Desktop/PLSC 309/ProblemSets")
# Read the turnout data set into a data frame.
mydata <- read.csv(file = "StateTurnout2020.csv")
# Number of observations (rows).
dim(mydata)[1L]
## [1] 51
# Number of variables (columns).
dim(mydata)[2L]
## [1] 35
# Preview the first 6 rows of the data.
head(mydata, n = 6L)
## State Ballots_cast VEP_pop VAP_pop Registered_total
## 1 ALABAMA 2323282 3761001.3 3917288 2527
## 2 ALASKA 359530 528179.7 553820 383
## 3 ARIZONA 3387054 4984557.4 5574070 3878
## 4 ARKANSAS 1219069 2166786.5 2315001 1361
## 5 CALIFORNIA 17500881 25917882.1 30730598 18001
## 6 COLORADO 3256980 4246894.6 4525319 2993
## Reported_votes_total VEP_pop_18_24 VEP_pop_25_34 VEP_pop_35_44 VEP_pop_45_64
## 1 2247 427 587 608 1246
## 2 330 59 94 99 174
## 3 3649 629 850 756 1627
## 4 1186 261 389 319 680
## 5 16893 3334 4957 4016 7973
## 6 2837 481 942 629 1302
## VEP_pop_65_plus VEP_pop_Female VEP_pop_Latino VEP_pop_Male VEP_pop_Total
## 1 848 1960 53 1755 3716
## 2 91 253 27 264 516
## 3 1213 2610 1340 2465 5075
## 4 547 1138 83 1057 2195
## 5 5666 13366 8305 12580 25946
## 6 845 2124 618 2076 4200
## VEP_pop_White Reported_votes_18_24 Reported_votes_25_34 Reported_votes_35_44
## 1 2569 194 315 365
## 2 323 29 41 62
## 3 3096 327 579 509
## 4 1733 84 176 179
## 5 11685 1786 2976 2554
## 6 3220 257 608 420
## Reported_votes_45_64 Reported_votes_65_plus Reported_votes_female
## 1 824 549 1209
## 2 131 67 165
## 3 1242 992 1996
## 4 401 346 640
## 5 5534 4044 8882
## 6 922 631 1482
## Reported_votes_latino Reported_votes_male Reported_votes_white VAP_pop_18_24
## 1 30 1038 1617 431
## 2 17 165 230 60
## 3 814 1653 2385 653
## 4 29 546 988 263
## 5 4539 8012 8711 3685
## 6 315 1355 2316 528
## VAP_pop_25_34 VAP_pop_35_44 VAP_pop_45_64 VAP_pop_65_plus VAP_pop_Female
## 1 599 626 1261 852 1990
## 2 97 102 177 92 259
## 3 941 929 1833 1282 2899
## 4 420 341 706 553 1182
## 5 5953 5090 9499 6117 15556
## 6 996 717 1432 852 2271
## VAP_pop_Latino VAP_pop_Male VAP_pop_census VAP_pop_White
## 1 79 1780 3769 2587
## 2 28 269 528 325
## 3 1800 2739 5638 3140
## 4 134 1101 2283 1744
## 5 11165 14786 30342 12090
## 6 854 2254 4525 3267
The first number, 51, is the number of observations (rows) in the data set. The second number, 35, is the number of variables (columns) in the data set.
# Count the missing values in each column; the result is one named total per
# variable.
vapply(mydata, function(column) sum(is.na(column)), numeric(1))
## State Ballots_cast VEP_pop
## 0 0 0
## VAP_pop Registered_total Reported_votes_total
## 0 0 0
## VEP_pop_18_24 VEP_pop_25_34 VEP_pop_35_44
## 0 0 0
## VEP_pop_45_64 VEP_pop_65_plus VEP_pop_Female
## 0 0 0
## VEP_pop_Latino VEP_pop_Male VEP_pop_Total
## 0 0 0
## VEP_pop_White Reported_votes_18_24 Reported_votes_25_34
## 0 0 0
## Reported_votes_35_44 Reported_votes_45_64 Reported_votes_65_plus
## 0 0 0
## Reported_votes_female Reported_votes_latino Reported_votes_male
## 0 0 0
## Reported_votes_white VAP_pop_18_24 VAP_pop_25_34
## 0 0 0
## VAP_pop_35_44 VAP_pop_45_64 VAP_pop_65_plus
## 0 0 0
## VAP_pop_Female VAP_pop_Latino VAP_pop_Male
## 0 0 0
## VAP_pop_census VAP_pop_White
## 0 0
# Voting-eligible turnout rate from the administrative data:
# ballots cast divided by the voting-eligible population, row by row.
mydata[["VEP.turnout"]] <- mydata[["Ballots_cast"]] / mydata[["VEP_pop"]]
print(mydata[["VEP.turnout"]])
## [1] 0.6177296 0.6806963 0.6795095 0.5626161 0.6752435 0.7669086 0.6930230
## [8] 0.6946968 0.6539150 0.7184420 0.6789074 0.5518227 0.6682436 0.6595314
## [15] 0.6083741 0.7248151 0.6559589 0.6437515 0.6357299 0.7494134 0.6920059
## [22] 0.7048978 0.7300178 0.7937856 0.5986450 0.6597651 0.7203099 0.6877055
## [29] 0.6603633 0.7404666 0.7061192 0.6075056 0.6058103 0.7251225 0.6297917
## [36] 0.6645301 0.5516146 0.7497555 0.6966064 0.6256171 0.6540035 0.6562631
## [43] 0.5951490 0.6063745 0.6795739 0.7155281 0.7176365 0.7561560 0.5666433
## [50] 0.7503134 0.6493084
# Voting-age turnout rate from the administrative data:
# ballots cast divided by the voting-age population, row by row.
mydata[["VAP.turnout"]] <- mydata[["Ballots_cast"]] / mydata[["VAP_pop"]]
print(mydata[["VAP.turnout"]])
## [1] 0.5930843 0.6491820 0.6076447 0.5265955 0.5694937 0.7197238 0.6354156
## [8] 0.6411542 0.6074273 0.6367952 0.6071643 0.4991455 0.6239323 0.6055255
## [15] 0.5833352 0.6896270 0.6160640 0.6128770 0.6029316 0.7363830 0.6311846
## [22] 0.6424129 0.7002890 0.7466368 0.5782625 0.6333446 0.7081734 0.6473529
## [29] 0.5802934 0.7162715 0.6256536 0.5634046 0.5388685 0.6753779 0.6088492
## [36] 0.6438075 0.5177236 0.7036222 0.6690085 0.5834684 0.6236186 0.6300328
## [43] 0.5657558 0.5162862 0.6355231 0.6990919 0.6609791 0.6773443 0.5554944
## [50] 0.7146876 0.6252005
# Voting-age turnout rate based on the US Census survey data: the reported vote
# total is multiplied by 1000 and divided by VAP_pop (the same denominator that
# VAP.turnout uses).
# NOTE(review): the data also contains VAP_pop_census; confirm whether the
# survey-based rate is meant to use that column as its denominator instead.
mydata[["VAP.census.turnout"]] <-
  mydata[["Reported_votes_total"]] * 1000 / mydata[["VAP_pop"]]
print(mydata[["VAP.census.turnout"]])
## [1] 0.5736111 0.5958615 0.6546384 0.5123108 0.5497127 0.6269171 0.5855097
## [8] 0.6216455 0.7902503 0.5592658 0.5935685 0.5473953 0.6059521 0.6079597
## [15] 0.5773500 0.6599064 0.5816396 0.6338817 0.5728808 0.6883419 0.6579884
## [22] 0.5747641 0.6313509 0.7347507 0.6738138 0.6258176 0.7097672 0.6037736
## [29] 0.5578410 0.7091612 0.6378448 0.5719627 0.5383769 0.5843296 0.6276640
## [36] 0.6661799 0.5410442 0.7118248 0.6535989 0.5803615 0.6101383 0.5665106
## [43] 0.6198793 0.5417898 0.5918441 0.6507109 0.6334873 0.6386302 0.5403588
## [50] 0.7049272 0.6325082
According to the data, the voting-age turnout rates based on the US Census survey data were, on average, the lowest; the actual (administrative) voting-age turnout rates were higher, and the voting-eligible turnout rates were the highest.
# Difference between the voting-age and voting-eligible turnout rates from the
# administrative data (VAP rate minus VEP rate), row by row.
mydata[["diff.VAP.VEP.admin"]] <-
  mydata[["VAP.turnout"]] - mydata[["VEP.turnout"]]
print(mydata[["diff.VAP.VEP.admin"]])
## [1] -0.02464535 -0.03151428 -0.07186480 -0.03602065 -0.10574982 -0.04718476
## [7] -0.05760747 -0.05354257 -0.04648771 -0.08164681 -0.07174316 -0.05267723
## [13] -0.04431125 -0.05400597 -0.02503889 -0.03518811 -0.03989493 -0.03087449
## [19] -0.03279835 -0.01303041 -0.06082128 -0.06248494 -0.02972878 -0.04714883
## [25] -0.02038250 -0.02642051 -0.01213649 -0.04035262 -0.08006987 -0.02419515
## [31] -0.08046562 -0.04410097 -0.06694184 -0.04974460 -0.02094244 -0.02072264
## [37] -0.03389101 -0.04613335 -0.02759789 -0.04214872 -0.03038488 -0.02623028
## [43] -0.02939316 -0.09008836 -0.04405078 -0.01643627 -0.05665746 -0.07881170
## [49] -0.01114891 -0.03562584 -0.02410795
# Mean of the VAP-minus-VEP turnout difference across all rows.
print(paste("Mean:", mean(mydata$diff.VAP.VEP.admin, na.rm = TRUE)))
## [1] "Mean: -0.0438273067203037"
# range() returns c(min, max). Collapse the pair into one string first so that
# paste() yields a single labelled result instead of recycling "Range:" onto
# each endpoint. na.rm = TRUE keeps the NA handling consistent with the mean.
print(paste(
  "Range:",
  paste(range(mydata$diff.VAP.VEP.admin, na.rm = TRUE), collapse = " to ")
))
## [1] "Range: -0.105749815454665 to -0.0111489086043757"
# Difference between the administrative voting-age turnout rate and the
# self-reported Census voting-age turnout rate (administrative minus Census).
mydata[["diff.VAP.admin.census"]] <-
  mydata[["VAP.turnout"]] - mydata[["VAP.census.turnout"]]
print(mydata[["diff.VAP.admin.census"]])
## [1] 0.0194731661 0.0533205735 -0.0469936689 0.0142846591 0.0197809688
## [6] 0.0928067171 0.0499059040 0.0195087348 -0.1828229928 0.0775294262
## [11] 0.0135957308 -0.0482498555 0.0179801739 -0.0024342474 0.0059852462
## [16] 0.0297206655 0.0344243673 -0.0210047168 0.0300508379 0.0480410561
## [21] -0.0268037788 0.0676487956 0.0689381055 0.0118861016 -0.0955512884
## [26] 0.0075269740 -0.0015937606 0.0435793214 0.0224523781 0.0071102973
## [31] -0.0121912528 -0.0085580990 0.0004915996 0.0910483276 -0.0188147752
## [36] -0.0223724702 -0.0233206316 -0.0082025802 0.0154095916 0.0031069060
## [41] 0.0134803587 0.0635222348 -0.0541234664 -0.0255036343 0.0436790326
## [46] 0.0483809285 0.0274917206 0.0387140168 0.0151356384 0.0097604136
## [51] -0.0073077288
# Mean of the administrative-minus-Census turnout difference across all rows.
print(paste("Mean:", mean(mydata$diff.VAP.admin.census, na.rm = TRUE)))
## [1] "Mean: 0.0101945494435088"
# range() returns c(min, max). Collapse the pair into one string first so that
# paste() yields a single labelled result instead of recycling "Range:" onto
# each endpoint. na.rm = TRUE keeps the NA handling consistent with the mean.
print(paste(
  "Range:",
  paste(range(mydata$diff.VAP.admin.census, na.rm = TRUE), collapse = " to ")
))
## [1] "Range: -0.182822992755451 to 0.0928067170513283"
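Both the mean and the full range of the difference between voting-age (VAP) and voting-eligible (VEP) turnout rates are negative, meaning that the VEP turnout rate is higher than the VAP turnout rate for every row in the data. This follows from the denominators: both rates divide the same number of ballots cast, but the voting-eligible population is smaller than the voting-age population because it leaves out voting-age residents who are not eligible to vote. Put differently, turnout among eligible voters is higher than turnout among the general voting-age population. A possible explanation for this could be that those who are eligible to vote are more engaged and interested in voting.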
The mean of the difference between the administrative voting-age turnout rate and the self-reported Census turnout rate is positive, meaning that administrative voting-age turnout is higher than the Census figure in most states. I say "most" because the difference ranges from -0.18 to +0.09, so in some states the Census figure is the higher one. A possible explanation for these results is the level of accuracy: administrative data may be more precise and more frequently updated than Census survey data.
# State with the highest administrative VAP turnout rate, then the rate itself.
mydata[["State"]][which.max(mydata[["VAP.turnout"]])]
## [1] "MINNESOTA"
max(mydata[["VAP.turnout"]], na.rm = TRUE)
## [1] 0.7466368
# State with the lowest administrative VAP turnout rate, then the rate itself.
mydata[["State"]][which.min(mydata[["VAP.turnout"]])]
## [1] "HAWAII"
min(mydata[["VAP.turnout"]], na.rm = TRUE)
## [1] 0.4991455
# State with the highest Census-based turnout rate, then the rate itself.
mydata[["State"]][which.max(mydata[["VAP.census.turnout"]])]
## [1] "DISTRICT OF COLUMBIA"
max(mydata[["VAP.census.turnout"]], na.rm = TRUE)
## [1] 0.7902503
# State with the lowest Census-based turnout rate, then the rate itself.
mydata[["State"]][which.min(mydata[["VAP.census.turnout"]])]
## [1] "ARKANSAS"
min(mydata[["VAP.census.turnout"]], na.rm = TRUE)
## [1] 0.5123108
# Turnout rate per age group: reported votes in the group divided by the
# voting-age population of the same group. The mean across rows is printed
# after each rate is stored.

# Ages 18-24
mydata[["VAP_rate_18_24"]] <-
  mydata[["Reported_votes_18_24"]] / mydata[["VAP_pop_18_24"]]
print(paste(
  "Mean of turnout for voting age adults 18-24:",
  mean(mydata[["VAP_rate_18_24"]], na.rm = TRUE)
))
## [1] "Mean of turnout for voting age adults 18-24: 0.488670019433395"
# Ages 25-34
mydata[["VAP_Rate_25_34"]] <-
  mydata[["Reported_votes_25_34"]] / mydata[["VAP_pop_25_34"]]
print(paste(
  "Mean of turnout for voting age adults 25-34:",
  mean(mydata[["VAP_Rate_25_34"]], na.rm = TRUE)
))
## [1] "Mean of turnout for voting age adults 25-34: 0.547481975299716"
# Ages 35-44
mydata[["VAP_Rate_35_44"]] <-
  mydata[["Reported_votes_35_44"]] / mydata[["VAP_pop_35_44"]]
print(paste(
  "Mean of turnout for voting age adults 35-44:",
  mean(mydata[["VAP_Rate_35_44"]], na.rm = TRUE)
))
## [1] "Mean of turnout for voting age adults 35-44: 0.593840373296011"
# Turnout rate for voting age adults 45-64: reported votes in that age group
# divided by the voting-age population of the same group. The numerator must be
# Reported_votes_45_64; dividing the 25-34 vote counts by the 45-64 population
# mixes two age groups and understates the rate.
mydata$VAP_Rate_45_64 <- mydata$Reported_votes_45_64 / mydata$VAP_pop_45_64
print(paste(
  "Mean of turnout for voting age adults 45-64:",
  mean(mydata$VAP_Rate_45_64, na.rm = TRUE)
))
## (output not regenerated: the previously shown mean, 0.3036716395508, was computed with Reported_votes_25_34 as the numerator - re-run to refresh)
# Turnout rate for voting age adults 65 and over: reported votes in that age
# group divided by the voting-age population of the same group.
mydata[["VAP_Rate_65_plus"]] <-
  mydata[["Reported_votes_65_plus"]] / mydata[["VAP_pop_65_plus"]]
print(paste(
  "Mean of turnout for voting age adults 65 plus:",
  mean(mydata[["VAP_Rate_65_plus"]], na.rm = TRUE)
))
## [1] "Mean of turnout for voting age adults 65 plus: 0.734145682431118"
#Insert paragraph here
###Question 7
# State with the highest turnout rate for adults ages 18-24, then the rate.
mydata[["State"]][which.max(mydata[["VAP_rate_18_24"]])]
## [1] "DISTRICT OF COLUMBIA"
max(mydata[["VAP_rate_18_24"]])
## [1] 0.7460317
# State with the lowest turnout rate for adults ages 18-24, then the rate.
mydata[["State"]][which.min(mydata[["VAP_rate_18_24"]])]
## [1] "OKLAHOMA"
min(mydata[["VAP_rate_18_24"]])
## [1] 0.2973761
# Look up the ages 18-24 turnout rate for five chosen states by matching on the
# State column, printing each value as it is found.
PA_turnout <- mydata[["VAP_rate_18_24"]][mydata[["State"]] == "PENNSYLVANIA"]
print(PA_turnout)
## [1] 0.4947569
FL_turnout <- mydata[["VAP_rate_18_24"]][mydata[["State"]] == "FLORIDA"]
print(FL_turnout)
## [1] 0.4164524
NY_turnout <- mydata[["VAP_rate_18_24"]][mydata[["State"]] == "NEW YORK"]
print(NY_turnout)
## [1] 0.432734
GA_turnout <- mydata[["VAP_rate_18_24"]][mydata[["State"]] == "GEORGIA"]
print(GA_turnout)
## [1] 0.4939394
CA_turnout <- mydata[["VAP_rate_18_24"]][mydata[["State"]] == "CALIFORNIA"]
print(CA_turnout)
## [1] 0.4846676
# Gather the five state rates looked up earlier into one named vector. Reusing
# the stored values, rather than retyping the rounded numbers from the printed
# output, keeps full precision and stays correct if the data or the rate
# calculation changes.
youth_turnout <- c(
  PA = PA_turnout,
  FL = FL_turnout,
  NY = NY_turnout,
  GA = GA_turnout,
  CA = CA_turnout
)
# Each lookup must have matched exactly one row; otherwise c() would silently
# drop or duplicate entries and the state labels would be misleading.
stopifnot(length(youth_turnout) == 5L)
print(youth_turnout)
## PA FL NY GA CA
## 0.4947569 0.4164524 0.4327340 0.4939394 0.4846676