Libraries used

library(rio) #to import data
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
The following rio suggested packages are not installed: ‘csvy’, ‘feather’, ‘fst’, ‘hexView’, ‘readODS’, ‘rmatio’
Use 'install_formats()' to install them
library(tidyverse) #to tidy data
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.0     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.1
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(sf) # to read spatial datains
Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tmap) # to create maps
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
library(RColorBrewer) #to create choropleth color schemes

Raw data and raw map

#We read the original non-spatial csv data and the original election districts boundary shapefile for 2003-2012. The former is straight from Dropbox. The latter is obtained from the June 18 task (Dropbox shapefile manipulated into geojson for easier processing.)
#Read and display raw data from "japan_election_money.dta"
raw_data <- import("japan_election_money.dta")
raw_data
names(raw_data)
 [1] "chamber"           "year"              "prefecture"        "district_number"   "party"            
 [6] "party_english"     "name"              "age"               "male"              "occupation"       
[11] "inc"               "win"               "votes"             "SpendingLimit"     "Staff"            
[16] "Facilities"        "Telecommunication" "Transportation"    "Printing"          "Advertising"      
[21] "Stationery"        "Foods"             "Lodging"           "Miscelleneous"     "Spending"         
[26] "donation"          "otherincome"       "birthyear"         "cpi"               "samename"         
[31] "CandId"           
#Read and display 2003-2012 election boundaries
raw_map <- st_read("2003-2012 Japan Election Boundaries.geojson") %>% st_transform (2447) #EPSG is 2447 for Japan.
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2003-2012 Japan Election Boundaries.geojson' using driver `GeoJSON'
Simple feature collection with 300 features and 15 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
raw_map 
Simple feature collection with 300 features and 15 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: -1160363 ymin: -1284150 xmax: 930957.1 ymax: 1085939
epsg (SRID):    2447
proj4string:    +proj=tmerc +lat_0=36 +lon_0=134.3333333333333 +k=0.9999 +x_0=0 +y_0=0 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs
First 10 features:
   UserID   X...C.g.. X.o.... X.L.......h X...e X.摜.t...C X.....N.t.. X...l X.X.V.. X.....F X.h.. X...E...F X...E..
1     112  �k�C��12��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
2     202 �\u0090X2��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
3    2809     ����9��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
4    1303     ����3��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
5    3602     ����2��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
6    4010    ����10��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
7    4203     ����3��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
8    4602   ������2��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
9    4703     ����3��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
10   4701     ����1��    <NA>        <NA>  <NA>       <NA>        <NA>  <NA>    <NA>     128     7   4210816       1
      X.ʐ..m2. X.....m.                       geometry
1  14765470014  1213314 MULTIPOLYGON (((594442.4 10...
2   3533277611   438444 MULTIPOLYGON (((549273.6 63...
3    634745974   184994 MULTIPOLYGON (((61370.71 -1...
4    367897905   299564 MULTIPOLYGON (((465119.2 -1...
5   1446287890   311224 MULTIPOLYGON (((28669.2 -19...
6    282430080   106624 MULTIPOLYGON (((-305903.4 -...
7   1739268359   922765 MULTIPOLYGON (((-422568.9 -...
8   1642579328   753293 MULTIPOLYGON (((-422902.2 -...
9    940931618   402618 MULTIPOLYGON (((-626721.7 -...
10   176409935   163910 MULTIPOLYGON (((-700411.1 -...
tmap_mode("view")
tmap mode set to interactive viewing
tm_shape (raw_map) +
  tm_borders()

#map should read

Wrangling non-spatial election data

Manipulate 2003 non-spatial data to create sum of election spending for each prefecture-district, save processed data.

#use pipe to filter by year, then apply group_by to prefecture and district_id. Mutate to create PrefectureDistrictSpending, the total sum of spending in each prefecture-district. FInally, select the relevant variables and arrange by prefecture, district. The end result is called processed data, which is saved as a csv.
#2003
processed_data_2003 <- raw_data %>% filter (year == 2003) %>% group_by(prefecture, district_number) %>% summarize(PrefectureDistrictSpending2003 = sum(Spending)) %>% mutate("year" = 2003) %>% arrange(prefecture, district_number)
sum(is.na(processed_data_2003)) 
[1] 0
#check for "NAs". There are two types of NAs I encountered in the 4 years. First, NAs can appear explicitly in PrefectureDistrictSpending. We have to turn this type of NAs in election spending into missing implicits, or else choropleth intervals will NOT show these NAs correctly with a "missing" bracket. In 2003 data, no such NAs, so we go ahead. 
#The second type of NA is implicit. All districts in the csv could have election spending data. Some districts, though, are not present in the csv at all but are represented on the shapefile. These are also automatically categorized as "missing" when mapped onto the shapefile in choropleth mapping. So we don't need to worry about these NAs displaying incorrectly. 
#in this case for 2003 data, no explicit NAs, so we go ahead with the dataset. We check for NAs using sum(is.na(dataset)) 
processed_data_2003
#Save the processed data, identifying the year
write.csv (processed_data_2003, file = "processed_data_2003.csv")
raw_data %>% filter (year == 2003, prefecture == "北海道") %>% group_by(prefecture, district_number) %>% summarize(PrefectureDistrictSpending2003 = sum(Spending)) %>% mutate("year" = 2003) %>% arrange(prefecture, district_number)

2005

#Same process as 2003 
processed_data_2005 <- raw_data %>% filter (year == 2005) %>% group_by(prefecture, district_number) %>% summarize(PrefectureDistrictSpending2005 = sum(Spending))  %>% mutate("year" = 2005) %>% arrange(prefecture, district_number)
 
sum(is.na(processed_data_2005)) #No explicit NAs, so we can go ahead. 
[1] 0
processed_data_2005
write.csv (processed_data_2005, file = "processed_data_2005.csv")

2009

#2009
processed_data_2009 <- raw_data %>% filter (year == 2009) %>% group_by(prefecture, district_number) %>% summarize(PrefectureDistrictSpending2009 = sum(Spending))  %>% mutate("year" = 2009) %>% arrange(prefecture, district_number)
processed_data_2009
sum(is.na(processed_data_2009)) #5 explicit NAs
[1] 5
sum(is.na(processed_data_2009$PrefectureDistrictSpending2009)) #Turns out, all 5 explicit NAs occur in PrefectureDistrictSpending2009, since this code and above both yield 5. 
[1] 5
processed_data_2009[is.na(processed_data_2009)] <- "" #turn explicit NAs implicit with empty "" to correctly generate "missing" bracket in choropleth
sum(is.na(processed_data_2009)) # no more explict NAs, we go ahead
[1] 0
processed_data_2009
write.csv (processed_data_2009, file = "processed_data_2009.csv")

2012

processed_data_2012 <- raw_data %>% filter (year == 2012) %>% group_by(prefecture, district_number) %>% summarize(PrefectureDistrictSpending2012 = sum(Spending))  %>% mutate("year" = 2012) %>% arrange(prefecture, district_number)
sum(is.na(processed_data_2012)) #no explicit NAs, go ahead
[1] 0
processed_data_2012
write.csv (processed_data_2012, file = "processed_data_2012.csv")

UserID and merging

#See June 21 task for how I merged the non-spatial and spatial data. First, type in UserID for prefecture-districts in non-spatial dataset. Then, merge by UserID on Geoda with the spatial data.
#read merged Geojsons. 
final_map_2003<- st_read("2003 Election and Spending Map.geojson") %>% st_transform(2447)
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2003 Election and Spending Map.geojson' using driver `GeoJSON'
Simple feature collection with 300 features and 7 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
names(final_map_2003)
[1] "UserID"                                                        
[2] "c.NA_character_..NA_character_..NA_character_..NA_character_.."
[3] "UserID_1"                                                      
[4] "prefecture"                                                    
[5] "district_number"                                               
[6] "PrefectureDistrictSpending2003"                                
[7] "year"                                                          
[8] "geometry"                                                      
final_map_2005<- st_read("2005 Election and Spending Map.geojson") %>% st_transform(2447)
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2005 Election and Spending Map.geojson' using driver `GeoJSON'
Simple feature collection with 300 features and 7 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
final_map_2009<- st_read("2009 Election and Spending Map.geojson") %>% st_transform(2447)
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2009 Election and Spending Map.geojson' using driver `GeoJSON'
Simple feature collection with 300 features and 7 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
final_map_2012<- st_read("2012 Election and Spending Map.geojson") %>% st_transform(2447)
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2012 Election and Spending Map.geojson' using driver `GeoJSON'
Simple feature collection with 300 features and 7 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
final_map_2014<- st_read("2014 Election and Spending Map.geojson") %>% rename ("PrefectureDistrictSpending2014" = PrefectureDistrictSpending) %>% st_transform(2447) 
Reading layer `OGRGeoJSON' from data source `/Users/calvinzhang/Desktop/SISRM Research Files/June 26 2003 2005 2009 2012 Election Spending/2014 Election and Spending Map.geojson' using driver `GeoJSON'
Simple feature collection with 295 features and 7 fields
geometry type:  MULTIPOLYGON
dimension:      XY
bbox:           xmin: 122.9382 ymin: 24.04738 xmax: 145.818 ymax: 45.52539
epsg (SRID):    4326
proj4string:    +proj=longlat +datum=WGS84 +no_defs
#In June 21 task for 2014 data, "PrefectureDistrictSpending" was named without year, so we add year here to the name with rename.
#Map
tm_basemap ("OpenStreetMap.Mapnik") +
tm_shape (final_map_2003) +
  tm_fill ("PrefectureDistrictSpending2003", style = "jenks", palette = "BuGn", alpha = 0.7) +
tm_shape (final_map_2005) +
  tm_fill ("PrefectureDistrictSpending2005", style = "jenks", palette = "RdPu", alpha = 0.7) +
tm_shape (final_map_2009) +
  tm_fill ("PrefectureDistrictSpending2009", style = "jenks", palette = "BuPu", alpha = 0.7) +
tm_shape (final_map_2012) +
  tm_fill ("PrefectureDistrictSpending2012", style = "jenks", palette = "GnBu", alpha = 0.7) +
tm_shape (final_map_2014) +
  tm_fill ("PrefectureDistrictSpending2014", style = "jenks", palette = "YlOrBr", alpha = 0.7)

NA
---
title: "R Notebook"
output: html_notebook
---

# Libraries used
```{r}
# Packages attached for the whole notebook.
library(rio) # to import data
library(tidyverse) # to tidy data
library(sf) # to read spatial data
library(tmap) # to create maps
library(RColorBrewer) # to create choropleth color schemes
```

# Raw data and raw map
```{r}
# Inputs for this notebook:
#   * japan_election_money.dta: the original non-spatial election data,
#     taken straight from Dropbox.
#   * 2003-2012 Japan Election Boundaries.geojson: the election district
#     boundaries for 2003-2012, obtained in the June 18 task by converting
#     the Dropbox shapefile to GeoJSON for easier processing.

# Load the non-spatial data, print it, and list its column names.
raw_data <- import("japan_election_money.dta")
raw_data
names(raw_data)

# Load the 2003-2012 district boundaries, reproject them to EPSG:2447 (the
# CRS chosen for Japan in this project), and print the result.
raw_map <- "2003-2012 Japan Election Boundaries.geojson" %>%
  st_read() %>%
  st_transform(2447)
raw_map

# Switch tmap to interactive viewing.
tmap_mode("view")

# Draw only the district outlines.
# NOTE(review): presumably a visual check that the boundary file was read
# correctly -- confirm.
tm_shape(raw_map) + tm_borders()
```

## Wrangling non-spatial election data

# Manipulate 2003 non-spatial data to create the sum of election spending for each prefecture-district, then save the processed data.
```{r}
# 2003: total election spending per prefecture-district.
#
# Pipeline: keep the 2003 rows, group by prefecture and district_number,
# sum Spending within each group, drop the leftover grouping, tag every row
# with the year, and sort by prefecture and district. The result is saved as
# a csv below.
#
# sum() is called without na.rm, so a district total is NA whenever any of
# its rows has a missing Spending value.
processed_data_2003 <- raw_data %>%
  filter(year == 2003) %>%
  group_by(prefecture, district_number) %>%
  summarize(PrefectureDistrictSpending2003 = sum(Spending)) %>%
  # summarize() only removes the last grouping level; ungroup so the saved
  # table does not stay grouped by prefecture.
  ungroup() %>%
  mutate(year = 2003) %>%
  arrange(prefecture, district_number)

# Check for NAs. Two kinds of missing data occur across the four years:
#
# 1. Explicit NAs in the PrefectureDistrictSpending column. These must not
#    reach the choropleth as literal NA values, or the class intervals will
#    not show them in the "missing" bracket.
# 2. Implicit NAs: districts that are on the boundary map but have no row
#    in the non-spatial data at all. These are categorized as "missing"
#    automatically once the data is mapped onto the boundaries, so they need
#    no special handling.
#
# The count below is 0 for 2003 (no explicit NAs), so the table is used as is.
sum(is.na(processed_data_2003))

processed_data_2003

# Save the processed data, identifying the year in the file name.
write.csv(processed_data_2003, file = "processed_data_2003.csv")

# Spot check: recompute the same totals for a single prefecture (Hokkaido)
# directly from the raw data.
raw_data %>%
  filter(year == 2003, prefecture == "北海道") %>%
  group_by(prefecture, district_number) %>%
  summarize(PrefectureDistrictSpending2003 = sum(Spending)) %>%
  ungroup() %>%
  mutate(year = 2003) %>%
  arrange(prefecture, district_number)
```

# 2005
```{r}
# 2005: total election spending per prefecture-district (same process as
# 2003).
#
# Keep the 2005 rows, sum Spending within each prefecture/district_number
# group, drop the leftover grouping, tag the rows with the year, and sort.
# sum() is called without na.rm, so a district total is NA whenever any of
# its rows has a missing Spending value.
processed_data_2005 <- raw_data %>%
  filter(year == 2005) %>%
  group_by(prefecture, district_number) %>%
  summarize(PrefectureDistrictSpending2005 = sum(Spending)) %>%
  # summarize() only removes the last grouping level; ungroup so the saved
  # table does not stay grouped by prefecture.
  ungroup() %>%
  mutate(year = 2005) %>%
  arrange(prefecture, district_number)

# Count explicit NAs. The count is 0 for 2005, so we can go ahead.
sum(is.na(processed_data_2005))

processed_data_2005

# Save the processed data, identifying the year in the file name.
write.csv(processed_data_2005, file = "processed_data_2005.csv")
```

# 2009
```{r}
# 2009: total election spending per prefecture-district.
#
# Keep the 2009 rows, sum Spending within each prefecture/district_number
# group, drop the leftover grouping, tag the rows with the year, and sort.
# sum() is called without na.rm, so a district total is NA whenever any of
# its rows has a missing Spending value.
processed_data_2009 <- raw_data %>%
  filter(year == 2009) %>%
  group_by(prefecture, district_number) %>%
  summarize(PrefectureDistrictSpending2009 = sum(Spending)) %>%
  # summarize() only removes the last grouping level; ungroup so the saved
  # table does not stay grouped by prefecture.
  ungroup() %>%
  mutate(year = 2009) %>%
  arrange(prefecture, district_number)

processed_data_2009

# Count explicit NAs in the whole table and in the spending column alone.
# Both counts are 5 for 2009, i.e. every explicit NA is in
# PrefectureDistrictSpending2009.
sum(is.na(processed_data_2009))

sum(is.na(processed_data_2009$PrefectureDistrictSpending2009))

# The explicit NAs must reach the merge step as empty cells so the choropleth
# puts those districts in the "missing" bracket. Do that when writing the
# file (na = "") rather than by overwriting the NAs with "" in the table:
# assigning a string into the spending column would turn the whole numeric
# column into character.
write.csv(processed_data_2009, file = "processed_data_2009.csv", na = "")
```

# 2012
```{r}
# 2012: total election spending per prefecture-district.
#
# Keep the 2012 rows, sum Spending within each prefecture/district_number
# group, drop the leftover grouping, tag the rows with the year, and sort.
# sum() is called without na.rm, so a district total is NA whenever any of
# its rows has a missing Spending value.
processed_data_2012 <- raw_data %>%
  filter(year == 2012) %>%
  group_by(prefecture, district_number) %>%
  summarize(PrefectureDistrictSpending2012 = sum(Spending)) %>%
  # summarize() only removes the last grouping level; ungroup so the saved
  # table does not stay grouped by prefecture.
  ungroup() %>%
  mutate(year = 2012) %>%
  arrange(prefecture, district_number)

# Count explicit NAs. The count is 0 for 2012, so we can go ahead.
sum(is.na(processed_data_2012))

processed_data_2012

# Save the processed data, identifying the year in the file name.
write.csv(processed_data_2012, file = "processed_data_2012.csv")
```


# UserID and merging
```{r}
# The non-spatial totals were merged with the spatial data outside this
# notebook (see the June 21 task): a UserID was typed in for each
# prefecture-district in the non-spatial dataset, and the two were then
# merged by UserID in GeoDa.

# Read the merged GeoJSON files, one per election year, and reproject each
# to EPSG:2447.
final_map_2003 <- "2003 Election and Spending Map.geojson" %>%
  st_read() %>%
  st_transform(2447)

# List the columns the merged 2003 file carries.
names(final_map_2003)

final_map_2005 <- "2005 Election and Spending Map.geojson" %>%
  st_read() %>%
  st_transform(2447)

final_map_2009 <- "2009 Election and Spending Map.geojson" %>%
  st_read() %>%
  st_transform(2447)

final_map_2012 <- "2012 Election and Spending Map.geojson" %>%
  st_read() %>%
  st_transform(2447)

# In the June 21 task the 2014 spending column was named
# "PrefectureDistrictSpending" without a year, so the year is added to the
# name here to match the other maps.
final_map_2014 <- "2014 Election and Spending Map.geojson" %>%
  st_read() %>%
  rename(PrefectureDistrictSpending2014 = PrefectureDistrictSpending) %>%
  st_transform(2447)

# Map: one semi-transparent jenks-classified fill layer per election year,
# each with its own palette, drawn over an OpenStreetMap basemap.
tm_basemap("OpenStreetMap.Mapnik") +
  tm_shape(final_map_2003) +
  tm_fill(
    "PrefectureDistrictSpending2003",
    style = "jenks", palette = "BuGn", alpha = 0.7
  ) +
  tm_shape(final_map_2005) +
  tm_fill(
    "PrefectureDistrictSpending2005",
    style = "jenks", palette = "RdPu", alpha = 0.7
  ) +
  tm_shape(final_map_2009) +
  tm_fill(
    "PrefectureDistrictSpending2009",
    style = "jenks", palette = "BuPu", alpha = 0.7
  ) +
  tm_shape(final_map_2012) +
  tm_fill(
    "PrefectureDistrictSpending2012",
    style = "jenks", palette = "GnBu", alpha = 0.7
  ) +
  tm_shape(final_map_2014) +
  tm_fill(
    "PrefectureDistrictSpending2014",
    style = "jenks", palette = "YlOrBr", alpha = 0.7
  )
  

```

