# Load the libraries that may be needed for this analysis.
library("pacman")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("dplyr")
library("ggplot2")
library("scales")
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Read the CSV data set straight from its raw GitHub URL.
# read.csv() is used rather than read.table(sep = ","): read.table()'s defaults
# treat ' as a quote character and # as a comment marker and reject short rows,
# any of which can break or silently truncate ordinary CSV content.
theUrl <- "https://raw.githubusercontent.com/enidroman/data607_acquisition_and_managent/main/new-voter-registrations.csv"
voter_registration <- read.csv(file = theUrl, header = TRUE)
voter_registration
## Jurisdiction Year Month New.registered.voters
## 1 Arizona 2016 Jan 25852
## 2 Arizona 2016 Feb 51155
## 3 Arizona 2016 Mar 48614
## 4 Arizona 2016 Apr 30668
## 5 Arizona 2020 Jan 33229
## 6 Arizona 2020 Feb 50853
## 7 Arizona 2020 Mar 31872
## 8 Arizona 2020 Apr 10249
## 9 California 2016 Jan 87574
## 10 California 2016 Feb 103377
## 11 California 2016 Mar 174278
## 12 California 2016 Apr 185478
## 13 California 2020 Jan 151595
## 14 California 2020 Feb 238281
## 15 California 2020 Mar 176810
## 16 California 2020 Apr 38970
## 17 Colorado 2016 Jan 17024
## 18 Colorado 2016 Feb 20707
## 19 Colorado 2016 Mar 25627
## 20 Colorado 2016 Apr 22204
## 21 Colorado 2020 Jan 20260
## 22 Colorado 2020 Feb 33374
## 23 Colorado 2020 Mar 18990
## 24 Colorado 2020 Apr 6034
## 25 Delaware 2016 Jan 3007
## 26 Delaware 2016 Feb 3629
## 27 Delaware 2016 Mar 5124
## 28 Delaware 2016 Apr 3818
## 29 Delaware 2020 Jan 3276
## 30 Delaware 2020 Feb 3353
## 31 Delaware 2020 Mar 2535
## 32 Delaware 2020 Apr 589
## 33 District of Columbia 2016 Jan 2840
## 34 District of Columbia 2016 Feb 2954
## 35 District of Columbia 2016 Mar 4706
## 36 District of Columbia 2016 Apr 4157
## 37 District of Columbia 2016 May 5714
## 38 District of Columbia 2020 Jan 3334
## 39 District of Columbia 2020 Feb 3348
## 40 District of Columbia 2020 Mar 2225
## 41 District of Columbia 2020 Apr 1281
## 42 District of Columbia 2020 May 1925
## 43 Florida 2016 Jan 50231
## 44 Florida 2016 Feb 87351
## 45 Florida 2016 Mar 73627
## 46 Florida 2016 Apr 52508
## 47 Florida 2020 Jan 77466
## 48 Florida 2020 Feb 109859
## 49 Florida 2020 Mar 54872
## 50 Florida 2020 Apr 21031
## 51 Georgia 2016 Jan 34952
## 52 Georgia 2016 Feb 40976
## 53 Georgia 2016 Mar 44150
## 54 Georgia 2016 Apr 37028
## 55 Georgia 2020 Jan 38573
## 56 Georgia 2020 Feb 55386
## 57 Georgia 2020 Mar 26284
## 58 Georgia 2020 Apr 15484
## 59 Illinois 2016 Jan 44040
## 60 Illinois 2016 Feb 99674
## 61 Illinois 2016 Mar 52782
## 62 Illinois 2016 Apr 76098
## 63 Illinois 2020 Jan 44443
## 64 Illinois 2020 Feb 68455
## 65 Illinois 2020 Mar 47899
## 66 Illinois 2020 Apr 21332
## 67 Maryland 2016 Jan 19580
## 68 Maryland 2016 Feb 29122
## 69 Maryland 2016 Mar 40497
## 70 Maryland 2016 Apr 26655
## 71 Maryland 2016 May 5828
## 72 Maryland 2020 Jan 21532
## 73 Maryland 2020 Feb 20708
## 74 Maryland 2020 Mar 23864
## 75 Maryland 2020 Apr 10061
## 76 Maryland 2020 May 23488
## 77 North Carolina 2016 Jan 35213
## 78 North Carolina 2016 Feb 84357
## 79 North Carolina 2016 Mar 58272
## 80 North Carolina 2016 Apr 73341
## 81 North Carolina 2016 May 29374
## 82 North Carolina 2020 Jan 111990
## 83 North Carolina 2020 Feb 54053
## 84 North Carolina 2020 Mar 54807
## 85 North Carolina 2020 Apr 35484
## 86 North Carolina 2020 May 23517
## 87 Texas 2016 Jan 132860
## 88 Texas 2016 Feb 143795
## 89 Texas 2016 Mar 170607
## 90 Texas 2016 Apr 143199
## 91 Texas 2016 May 91205
## 92 Texas 2020 Jan 134559
## 93 Texas 2020 Feb 130080
## 94 Texas 2020 Mar 129424
## 95 Texas 2020 Apr 34694
## 96 Texas 2020 May 35678
## 97 Virginia 2016 Jan 20032
## 98 Virginia 2016 Feb 36911
## 99 Virginia 2016 Mar 44171
## 100 Virginia 2016 Apr 20460
## 101 Virginia 2016 May 26239
## 102 Virginia 2020 Jan 25934
## 103 Virginia 2020 Feb 29507
## 104 Virginia 2020 Mar 31492
## 105 Virginia 2020 Apr 5467
## 106 Virginia 2020 May 8239
# Inspect the structure of the data frame (column names and types).
# vec.len = 1 keeps the preview of each column's values short.
str(voter_registration, vec.len = 1)
## 'data.frame': 106 obs. of 4 variables:
## $ Jurisdiction : chr "Arizona" ...
## $ Year : int 2016 2016 ...
## $ Month : chr "Jan" ...
## $ New.registered.voters: int 25852 51155 ...
# Treat Year as a categorical variable (a factor) rather than an integer,
# then re-check the structure to confirm the new column type.
voter_registration[["Year"]] <- as.factor(voter_registration[["Year"]])
str(voter_registration, vec.len = 1)
## 'data.frame': 106 obs. of 4 variables:
## $ Jurisdiction : chr "Arizona" ...
## $ Year : Factor w/ 2 levels "2016","2020": 1 1 ...
## $ Month : chr "Jan" ...
## $ New.registered.voters: int 25852 51155 ...
# Summarise every column of the data set.
summary(voter_registration)
## Jurisdiction Year Month New.registered.voters
## Length:106 2016:53 Length:106 Min. : 589
## Class :character 2020:53 Class :character 1st Qu.: 19138
## Mode :character Mode :character Median : 33302
## Mean : 48223
## 3rd Qu.: 55258
## Max. :238281
# Drop the May rows. Only some jurisdictions report May, so removing it leaves
# the same four months (Jan-Apr) for every jurisdiction.
# The earlier attempt, subset(voter_registration, "Month" != "May"), removed
# nothing: the quoted "Month" is a string literal, so the condition compares
# two constants and is TRUE for every row.
# Filtering on the column itself replaces the hand-collected row indices, which
# would silently drop the wrong rows if the data ever changed. %in% is used so
# that a missing Month value does not turn into an all-NA row.
voter_registration_to_remain <-
  voter_registration[!(voter_registration$Month %in% "May"), ]
voter_registration_to_remain
## Jurisdiction Year Month New.registered.voters
## 1 Arizona 2016 Jan 25852
## 2 Arizona 2016 Feb 51155
## 3 Arizona 2016 Mar 48614
## 4 Arizona 2016 Apr 30668
## 5 Arizona 2020 Jan 33229
## 6 Arizona 2020 Feb 50853
## 7 Arizona 2020 Mar 31872
## 8 Arizona 2020 Apr 10249
## 9 California 2016 Jan 87574
## 10 California 2016 Feb 103377
## 11 California 2016 Mar 174278
## 12 California 2016 Apr 185478
## 13 California 2020 Jan 151595
## 14 California 2020 Feb 238281
## 15 California 2020 Mar 176810
## 16 California 2020 Apr 38970
## 17 Colorado 2016 Jan 17024
## 18 Colorado 2016 Feb 20707
## 19 Colorado 2016 Mar 25627
## 20 Colorado 2016 Apr 22204
## 21 Colorado 2020 Jan 20260
## 22 Colorado 2020 Feb 33374
## 23 Colorado 2020 Mar 18990
## 24 Colorado 2020 Apr 6034
## 25 Delaware 2016 Jan 3007
## 26 Delaware 2016 Feb 3629
## 27 Delaware 2016 Mar 5124
## 28 Delaware 2016 Apr 3818
## 29 Delaware 2020 Jan 3276
## 30 Delaware 2020 Feb 3353
## 31 Delaware 2020 Mar 2535
## 32 Delaware 2020 Apr 589
## 33 District of Columbia 2016 Jan 2840
## 34 District of Columbia 2016 Feb 2954
## 35 District of Columbia 2016 Mar 4706
## 36 District of Columbia 2016 Apr 4157
## 38 District of Columbia 2020 Jan 3334
## 39 District of Columbia 2020 Feb 3348
## 40 District of Columbia 2020 Mar 2225
## 41 District of Columbia 2020 Apr 1281
## 43 Florida 2016 Jan 50231
## 44 Florida 2016 Feb 87351
## 45 Florida 2016 Mar 73627
## 46 Florida 2016 Apr 52508
## 47 Florida 2020 Jan 77466
## 48 Florida 2020 Feb 109859
## 49 Florida 2020 Mar 54872
## 50 Florida 2020 Apr 21031
## 51 Georgia 2016 Jan 34952
## 52 Georgia 2016 Feb 40976
## 53 Georgia 2016 Mar 44150
## 54 Georgia 2016 Apr 37028
## 55 Georgia 2020 Jan 38573
## 56 Georgia 2020 Feb 55386
## 57 Georgia 2020 Mar 26284
## 58 Georgia 2020 Apr 15484
## 59 Illinois 2016 Jan 44040
## 60 Illinois 2016 Feb 99674
## 61 Illinois 2016 Mar 52782
## 62 Illinois 2016 Apr 76098
## 63 Illinois 2020 Jan 44443
## 64 Illinois 2020 Feb 68455
## 65 Illinois 2020 Mar 47899
## 66 Illinois 2020 Apr 21332
## 67 Maryland 2016 Jan 19580
## 68 Maryland 2016 Feb 29122
## 69 Maryland 2016 Mar 40497
## 70 Maryland 2016 Apr 26655
## 72 Maryland 2020 Jan 21532
## 73 Maryland 2020 Feb 20708
## 74 Maryland 2020 Mar 23864
## 75 Maryland 2020 Apr 10061
## 77 North Carolina 2016 Jan 35213
## 78 North Carolina 2016 Feb 84357
## 79 North Carolina 2016 Mar 58272
## 80 North Carolina 2016 Apr 73341
## 82 North Carolina 2020 Jan 111990
## 83 North Carolina 2020 Feb 54053
## 84 North Carolina 2020 Mar 54807
## 85 North Carolina 2020 Apr 35484
## 87 Texas 2016 Jan 132860
## 88 Texas 2016 Feb 143795
## 89 Texas 2016 Mar 170607
## 90 Texas 2016 Apr 143199
## 92 Texas 2020 Jan 134559
## 93 Texas 2020 Feb 130080
## 94 Texas 2020 Mar 129424
## 95 Texas 2020 Apr 34694
## 97 Virginia 2016 Jan 20032
## 98 Virginia 2016 Feb 36911
## 99 Virginia 2016 Mar 44171
## 100 Virginia 2016 Apr 20460
## 102 Virginia 2020 Jan 25934
## 103 Virginia 2020 Feb 29507
## 104 Virginia 2020 Mar 31492
## 105 Virginia 2020 Apr 5467
# Removing the May rows left gaps in the row names, so renumber them 1..n.
# Assigning NULL resets the row names to the automatic sequence and, unlike
# 1:nrow(x), also works when the data frame has zero rows.
voter_registration_new <- voter_registration_to_remain
rownames(voter_registration_new) <- NULL
voter_registration_new
## Jurisdiction Year Month New.registered.voters
## 1 Arizona 2016 Jan 25852
## 2 Arizona 2016 Feb 51155
## 3 Arizona 2016 Mar 48614
## 4 Arizona 2016 Apr 30668
## 5 Arizona 2020 Jan 33229
## 6 Arizona 2020 Feb 50853
## 7 Arizona 2020 Mar 31872
## 8 Arizona 2020 Apr 10249
## 9 California 2016 Jan 87574
## 10 California 2016 Feb 103377
## 11 California 2016 Mar 174278
## 12 California 2016 Apr 185478
## 13 California 2020 Jan 151595
## 14 California 2020 Feb 238281
## 15 California 2020 Mar 176810
## 16 California 2020 Apr 38970
## 17 Colorado 2016 Jan 17024
## 18 Colorado 2016 Feb 20707
## 19 Colorado 2016 Mar 25627
## 20 Colorado 2016 Apr 22204
## 21 Colorado 2020 Jan 20260
## 22 Colorado 2020 Feb 33374
## 23 Colorado 2020 Mar 18990
## 24 Colorado 2020 Apr 6034
## 25 Delaware 2016 Jan 3007
## 26 Delaware 2016 Feb 3629
## 27 Delaware 2016 Mar 5124
## 28 Delaware 2016 Apr 3818
## 29 Delaware 2020 Jan 3276
## 30 Delaware 2020 Feb 3353
## 31 Delaware 2020 Mar 2535
## 32 Delaware 2020 Apr 589
## 33 District of Columbia 2016 Jan 2840
## 34 District of Columbia 2016 Feb 2954
## 35 District of Columbia 2016 Mar 4706
## 36 District of Columbia 2016 Apr 4157
## 37 District of Columbia 2020 Jan 3334
## 38 District of Columbia 2020 Feb 3348
## 39 District of Columbia 2020 Mar 2225
## 40 District of Columbia 2020 Apr 1281
## 41 Florida 2016 Jan 50231
## 42 Florida 2016 Feb 87351
## 43 Florida 2016 Mar 73627
## 44 Florida 2016 Apr 52508
## 45 Florida 2020 Jan 77466
## 46 Florida 2020 Feb 109859
## 47 Florida 2020 Mar 54872
## 48 Florida 2020 Apr 21031
## 49 Georgia 2016 Jan 34952
## 50 Georgia 2016 Feb 40976
## 51 Georgia 2016 Mar 44150
## 52 Georgia 2016 Apr 37028
## 53 Georgia 2020 Jan 38573
## 54 Georgia 2020 Feb 55386
## 55 Georgia 2020 Mar 26284
## 56 Georgia 2020 Apr 15484
## 57 Illinois 2016 Jan 44040
## 58 Illinois 2016 Feb 99674
## 59 Illinois 2016 Mar 52782
## 60 Illinois 2016 Apr 76098
## 61 Illinois 2020 Jan 44443
## 62 Illinois 2020 Feb 68455
## 63 Illinois 2020 Mar 47899
## 64 Illinois 2020 Apr 21332
## 65 Maryland 2016 Jan 19580
## 66 Maryland 2016 Feb 29122
## 67 Maryland 2016 Mar 40497
## 68 Maryland 2016 Apr 26655
## 69 Maryland 2020 Jan 21532
## 70 Maryland 2020 Feb 20708
## 71 Maryland 2020 Mar 23864
## 72 Maryland 2020 Apr 10061
## 73 North Carolina 2016 Jan 35213
## 74 North Carolina 2016 Feb 84357
## 75 North Carolina 2016 Mar 58272
## 76 North Carolina 2016 Apr 73341
## 77 North Carolina 2020 Jan 111990
## 78 North Carolina 2020 Feb 54053
## 79 North Carolina 2020 Mar 54807
## 80 North Carolina 2020 Apr 35484
## 81 Texas 2016 Jan 132860
## 82 Texas 2016 Feb 143795
## 83 Texas 2016 Mar 170607
## 84 Texas 2016 Apr 143199
## 85 Texas 2020 Jan 134559
## 86 Texas 2020 Feb 130080
## 87 Texas 2020 Mar 129424
## 88 Texas 2020 Apr 34694
## 89 Virginia 2016 Jan 20032
## 90 Virginia 2016 Feb 36911
## 91 Virginia 2016 Mar 44171
## 92 Virginia 2016 Apr 20460
## 93 Virginia 2020 Jan 25934
## 94 Virginia 2020 Feb 29507
## 95 Virginia 2020 Mar 31492
## 96 Virginia 2020 Apr 5467
# Reshape from long to wide: the Year values become column names and the
# new-registration counts fill those columns.
voter_registration_new_wide <- voter_registration_new %>%
  pivot_wider(
    names_from = Year,
    values_from = New.registered.voters
  )
voter_registration_new_wide
## # A tibble: 48 × 4
## Jurisdiction Month `2016` `2020`
## <chr> <chr> <int> <int>
## 1 Arizona Jan 25852 33229
## 2 Arizona Feb 51155 50853
## 3 Arizona Mar 48614 31872
## 4 Arizona Apr 30668 10249
## 5 California Jan 87574 151595
## 6 California Feb 103377 238281
## 7 California Mar 174278 176810
## 8 California Apr 185478 38970
## 9 Colorado Jan 17024 20260
## 10 Colorado Feb 20707 33374
## # … with 38 more rows
# Rename the columns. Renaming by name (rather than by position 1:4) fails
# loudly if an expected column is missing instead of silently mislabelling
# columns when their order changes.
voter_registration_new_wide <- voter_registration_new_wide %>%
  rename(
    STATE = Jurisdiction,
    MONTH = Month,
    REGISTERED_VOTERS_2016 = `2016`,
    REGISTERED_VOTERS_2020 = `2020`
  )
voter_registration_new_wide
## # A tibble: 48 × 4
## STATE MONTH REGISTERED_VOTERS_2016 REGISTERED_VOTERS_2020
## <chr> <chr> <int> <int>
## 1 Arizona Jan 25852 33229
## 2 Arizona Feb 51155 50853
## 3 Arizona Mar 48614 31872
## 4 Arizona Apr 30668 10249
## 5 California Jan 87574 151595
## 6 California Feb 103377 238281
## 7 California Mar 174278 176810
## 8 California Apr 185478 38970
## 9 Colorado Jan 17024 20260
## 10 Colorado Feb 20707 33374
## # … with 38 more rows
# Add a column holding the difference in registered voters between the two
# years, computed as 2016 minus 2020 (positive means 2016 was higher).
voter_registration_new_wide <- voter_registration_new_wide %>%
  mutate(
    REGISTERED_VOTERS_DIFF = REGISTERED_VOTERS_2016 - REGISTERED_VOTERS_2020
  )
voter_registration_new_wide
## # A tibble: 48 × 5
## STATE MONTH REGISTERED_VOTERS_2016 REGISTERED_VOTERS_2020 REGISTERED_V…¹
## <chr> <chr> <int> <int> <int>
## 1 Arizona Jan 25852 33229 -7377
## 2 Arizona Feb 51155 50853 302
## 3 Arizona Mar 48614 31872 16742
## 4 Arizona Apr 30668 10249 20419
## 5 California Jan 87574 151595 -64021
## 6 California Feb 103377 238281 -134904
## 7 California Mar 174278 176810 -2532
## 8 California Apr 185478 38970 146508
## 9 Colorado Jan 17024 20260 -3236
## 10 Colorado Feb 20707 33374 -12667
## # … with 38 more rows, and abbreviated variable name ¹REGISTERED_VOTERS_DIFF
# Summarise the reshaped data set, including the new difference column.
summary(voter_registration_new_wide)
## STATE MONTH REGISTERED_VOTERS_2016
## Length:48 Length:48 Min. : 2840
## Class :character Class :character 1st Qu.: 20645
## Mode :character Mode :character Median : 40737
## Mean : 54068
## 3rd Qu.: 74245
## Max. :185478
## REGISTERED_VOTERS_2020 REGISTERED_VOTERS_DIFF
## Min. : 589 Min. :-134904
## 1st Qu.: 14175 1st Qu.: -2097
## Median : 31682 Median : 3347
## Mean : 47192 Mean : 6877
## 3rd Qu.: 54823 3rd Qu.: 17023
## Max. :238281 Max. : 146508
# List the distinct values of the MONTH column.
unique(voter_registration_new_wide$MONTH)
## [1] "Jan" "Feb" "Mar" "Apr"
# Check the class of the MONTH column.
class(voter_registration_new_wide$MONTH)
## [1] "character"
# Convert MONTH from character to a factor with explicit levels so the months
# appear in calendar order (not alphabetical order) on the x axis.
voter_registration_new_wide$MONTH <- factor(
  voter_registration_new_wide$MONTH,
  levels = c("Jan", "Feb", "Mar", "Apr")
)
# Line plot of 2016 new registrations by month, one line per jurisdiction.
# NOTE: with the default colour scale, District of Columbia and Florida were
# observed to come out as nearly the same shade of green; a manual palette
# would make them easier to tell apart.
voter_registration_new_wide %>%
  ggplot(
    aes(
      x = MONTH,
      y = REGISTERED_VOTERS_2016,
      colour = STATE,
      group = STATE
    )
  ) +
  geom_line() +
  geom_point()
# Line plot of 2020 new registrations by month, one line per jurisdiction.
voter_registration_new_wide %>%
  ggplot(
    aes(
      x = MONTH,
      y = REGISTERED_VOTERS_2020,
      colour = STATE,
      group = STATE
    )
  ) +
  geom_line() +
  geom_point()
# Line plot of the 2016-minus-2020 registration difference by month,
# one line per jurisdiction.
voter_registration_new_wide %>%
  ggplot(
    aes(
      x = MONTH,
      y = REGISTERED_VOTERS_DIFF,
      colour = STATE,
      group = STATE
    )
  ) +
  geom_line() +
  geom_point()
# Sum the 2016 new registrations across all jurisdictions for each month
# (Jan-Apr).
voter_registration_new_2016 <- summarise(
  group_by(voter_registration_new_wide, MONTH),
  REGISTERED_VOTERS_2016 = sum(REGISTERED_VOTERS_2016)
)
voter_registration_new_2016
## # A tibble: 4 × 2
## MONTH REGISTERED_VOTERS_2016
## <fct> <int>
## 1 Jan 473205
## 2 Feb 704008
## 3 Mar 742455
## 4 Apr 675614
# Line plot of the monthly 2016 totals. group = 1 puts every month in a single
# group so geom_line() connects the points across the factor x axis.
voter_registration_new_2016 %>%
  ggplot(aes(x = MONTH, y = REGISTERED_VOTERS_2016, group = 1)) +
  geom_line() +
  geom_point()
# Sum the 2020 new registrations across all jurisdictions for each month
# (Jan-Apr).
voter_registration_new_2020 <- summarise(
  group_by(voter_registration_new_wide, MONTH),
  REGISTERED_VOTERS_2020 = sum(REGISTERED_VOTERS_2020)
)
voter_registration_new_2020
## # A tibble: 4 × 2
## MONTH REGISTERED_VOTERS_2020
## <fct> <int>
## 1 Jan 666191
## 2 Feb 797257
## 3 Mar 601074
## 4 Apr 200676
# Line plot of the monthly 2020 totals. group = 1 puts every month in a single
# group so geom_line() connects the points across the factor x axis.
voter_registration_new_2020 %>%
  ggplot(aes(x = MONTH, y = REGISTERED_VOTERS_2020, group = 1)) +
  geom_line() +
  geom_point()