This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(historydata)
library(ggplot2)

# Load my own data on the 1900 census. Text columns are kept as character
# vectors instead of being converted to factors.
rawdata <- read.csv(
  file = "historydata/1900Census.csv",
  stringsAsFactors = FALSE
)
rawdata
##   Census.year Total.Population number.deaf ratio.of.population
## 1        1830         12896020        6106                 475
## 2        1840         17000453        7623                 449
## 3        1850         23191876        9863                 423
## 4        1860         31463321       12871                 408
## 5        1870         38558371       16205                 420
## 6        1880         56155783       33878                 675
## 7        1890         62622250       40592                 648
## 8        1900         75964575       24360                 321
# Reshape the raw table into long form: keep the year, rename the two count
# columns, leave out the ratio column, then stack the counts so that each row
# holds one (year, respondant, results) observation.
censusTidy <- tbl_df(rawdata) %>%
  select(
    year = Census.year,
    respondants.total = Total.Population,
    respondants.deaf = number.deaf,
    -ratio.of.population
  ) %>%
  gather(key = respondant, value = results, -year)
censusTidy
## Source: local data frame [16 x 3]
## 
##    year        respondant  results
## 1  1830 respondants.total 12896020
## 2  1840 respondants.total 17000453
## 3  1850 respondants.total 23191876
## 4  1860 respondants.total 31463321
## 5  1870 respondants.total 38558371
## 6  1880 respondants.total 56155783
## 7  1890 respondants.total 62622250
## 8  1900 respondants.total 75964575
## 9  1830  respondants.deaf     6106
## 10 1840  respondants.deaf     7623
## 11 1850  respondants.deaf     9863
## 12 1860  respondants.deaf    12871
## 13 1870  respondants.deaf    16205
## 14 1880  respondants.deaf    33878
## 15 1890  respondants.deaf    40592
## 16 1900  respondants.deaf    24360
# Let's explore the data a bit, e.g. picking out a specific year or filtering
# by sample size. Here: only the rows for the 1900 census.
oneYear <- filter(censusTidy, year == 1900)
oneYear
## Source: local data frame [2 x 3]
## 
##   year        respondant  results
## 1 1900 respondants.total 75964575
## 2 1900  respondants.deaf    24360
# Keep only the rows whose count is at least 10,000.
sample10000 <- filter(censusTidy, results >= 10000)
sample10000
## Source: local data frame [13 x 3]
## 
##    year        respondant  results
## 1  1830 respondants.total 12896020
## 2  1840 respondants.total 17000453
## 3  1850 respondants.total 23191876
## 4  1860 respondants.total 31463321
## 5  1870 respondants.total 38558371
## 6  1880 respondants.total 56155783
## 7  1890 respondants.total 62622250
## 8  1900 respondants.total 75964575
## 9  1860  respondants.deaf    12871
## 10 1870  respondants.deaf    16205
## 11 1880  respondants.deaf    33878
## 12 1890  respondants.deaf    40592
## 13 1900  respondants.deaf    24360
# Order all rows by population size, smallest count first.
popSize <- arrange(censusTidy, results)
popSize
## Source: local data frame [16 x 3]
## 
##    year        respondant  results
## 1  1830  respondants.deaf     6106
## 2  1840  respondants.deaf     7623
## 3  1850  respondants.deaf     9863
## 4  1860  respondants.deaf    12871
## 5  1870  respondants.deaf    16205
## 6  1900  respondants.deaf    24360
## 7  1880  respondants.deaf    33878
## 8  1890  respondants.deaf    40592
## 9  1830 respondants.total 12896020
## 10 1840 respondants.total 17000453
## 11 1850 respondants.total 23191876
## 12 1860 respondants.total 31463321
## 13 1870 respondants.total 38558371
## 14 1880 respondants.total 56155783
## 15 1890 respondants.total 62622250
## 16 1900 respondants.total 75964575
# To get the hearing population for a given year (the total population minus
# the deaf population), go back to rawdata, where the two counts sit in
# separate columns, so that mutate() can compare them and create a new column.

censusCompare <- select(
  rawdata,
  year = Census.year,
  totalpop = Total.Population,
  deafpop = number.deaf,
  -ratio.of.population
)
censusCompare
##   year totalpop deafpop
## 1 1830 12896020    6106
## 2 1840 17000453    7623
## 3 1850 23191876    9863
## 4 1860 31463321   12871
## 5 1870 38558371   16205
## 6 1880 56155783   33878
## 7 1890 62622250   40592
## 8 1900 75964575   24360
# Hearing population = total population minus deaf population, per year.
hearingPop <- mutate(censusCompare, hearingpop = totalpop - deafpop)
hearingPop
##   year totalpop deafpop hearingpop
## 1 1830 12896020    6106   12889914
## 2 1840 17000453    7623   16992830
## 3 1850 23191876    9863   23182013
## 4 1860 31463321   12871   31450450
## 5 1870 38558371   16205   38542166
## 6 1880 56155783   33878   56121905
## 7 1890 62622250   40592   62581658
## 8 1900 75964575   24360   75940215
# Alternatively, start from censusTidy: spread() turns the values of the
# respondant column back into separate columns, which are then renamed and
# compared with mutate().

hearingPop2 <- spread(censusTidy, key = respondant, value = results) %>%
  select(
    year,
    totalpop = respondants.total,
    deafpop = respondants.deaf
  ) %>%
  mutate(hearingpop2 = totalpop - deafpop)
hearingPop2
## Source: local data frame [8 x 4]
## 
##   year totalpop deafpop hearingpop2
## 1 1830 12896020    6106    12889914
## 2 1840 17000453    7623    16992830
## 3 1850 23191876    9863    23182013
## 4 1860 31463321   12871    31450450
## 5 1870 38558371   16205    38542166
## 6 1880 56155783   33878    56121905
## 7 1890 62622250   40592    62581658
## 8 1900 75964575   24360    75940215
# Add up the deaf respondents across all census years. This is no indication
# of population size, since a number of respondents would potentially be
# counted several times - but let's see what happens anyway.

addDeaf <- summarize(hearingPop2, totaldeaf = sum(deafpop))
addDeaf
## Source: local data frame [1 x 1]
## 
##   totaldeaf
## 1    151498
# Ignore the failures below!
#Got it to work, but now ratio is in the same column as respondants and
# causing problems. Moving it around.

#censusTidy <- censusTidy %>%
 # tbl_df() %>%
  #select(Census.year,
   #      respondants.total = Total.Population,
    #     respondants.deaf = number.deaf,
     #    ratio = ratio.of.population) %>%
#  gather(respondant, results, ratio, -Census.year)
#censusTidy

#Nope, now respondants are split in columns
# I'd rather just remove the ratio column altogether
# Just removing the ratio options doesn't work

#censusTidy <- censusTidy %>%
 # tbl_df() %>%
  #select(Census.year,
   #      respondants.total = Total.Population,
    #     respondants.deaf = number.deaf,
     #   - ratio.of.population) %>%
 # gather(respondant, results, -Census.year)
#censusTidy