This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(historydata)
library(ggplot2)
# Cleaning up some of my own data on the 1900 census: read the CSV, keeping
# any text columns as plain character vectors rather than factors, then
# print the table to check it.
rawdata <- read.csv(
  file = "historydata/1900Census.csv",
  stringsAsFactors = FALSE
)
rawdata
## Census.year Total.Population number.deaf ratio.of.population
## 1 1830 12896020 6106 475
## 2 1840 17000453 7623 449
## 3 1850 23191876 9863 423
## 4 1860 31463321 12871 408
## 5 1870 38558371 16205 420
## 6 1880 56155783 33878 675
## 7 1890 62622250 40592 648
## 8 1900 75964575 24360 321
# Reshape to long ("tidy") form: one row per census year and respondent
# group. The year and the two count columns are kept under new names (the
# ratio column is excluded), then the counts are stacked into a key column
# (respondant) and a value column (results), leaving year as the id.
censusTidy <- tbl_df(rawdata) %>%
  select(
    year = Census.year,
    respondants.total = Total.Population,
    respondants.deaf = number.deaf,
    -ratio.of.population
  ) %>%
  gather(key = respondant, value = results, -year)
censusTidy
## Source: local data frame [16 x 3]
##
## year respondant results
## 1 1830 respondants.total 12896020
## 2 1840 respondants.total 17000453
## 3 1850 respondants.total 23191876
## 4 1860 respondants.total 31463321
## 5 1870 respondants.total 38558371
## 6 1880 respondants.total 56155783
## 7 1890 respondants.total 62622250
## 8 1900 respondants.total 75964575
## 9 1830 respondants.deaf 6106
## 10 1840 respondants.deaf 7623
## 11 1850 respondants.deaf 9863
## 12 1860 respondants.deaf 12871
## 13 1870 respondants.deaf 16205
## 14 1880 respondants.deaf 33878
## 15 1890 respondants.deaf 40592
## 16 1900 respondants.deaf 24360
# Let's play with the data a bit. What if I need a specific year?
# Keep only the rows of the tidy table whose year is 1900.
oneYear <- filter(censusTidy, year == 1900)
oneYear
## Source: local data frame [2 x 3]
##
## year respondant results
## 1 1900 respondants.total 75964575
## 2 1900 respondants.deaf 24360
# What if I need a minimum count? Keep only the rows whose results value is
# at least 10000 (a cut-off on the counts, not a random sample).
sample10000 <- filter(censusTidy, results >= 10000)
sample10000
## Source: local data frame [13 x 3]
##
## year respondant results
## 1 1830 respondants.total 12896020
## 2 1840 respondants.total 17000453
## 3 1850 respondants.total 23191876
## 4 1860 respondants.total 31463321
## 5 1870 respondants.total 38558371
## 6 1880 respondants.total 56155783
## 7 1890 respondants.total 62622250
## 8 1900 respondants.total 75964575
## 9 1860 respondants.deaf 12871
## 10 1870 respondants.deaf 16205
## 11 1880 respondants.deaf 33878
## 12 1890 respondants.deaf 40592
## 13 1900 respondants.deaf 24360
# If I wanted to organize the rows by population size: sort the tidy table
# on the results column (arrange() orders ascending unless told otherwise).
popSize <- arrange(censusTidy, results)
popSize
## Source: local data frame [16 x 3]
##
## year respondant results
## 1 1830 respondants.deaf 6106
## 2 1840 respondants.deaf 7623
## 3 1850 respondants.deaf 9863
## 4 1860 respondants.deaf 12871
## 5 1870 respondants.deaf 16205
## 6 1900 respondants.deaf 24360
## 7 1880 respondants.deaf 33878
## 8 1890 respondants.deaf 40592
## 9 1830 respondants.total 12896020
## 10 1840 respondants.total 17000453
## 11 1850 respondants.total 23191876
## 12 1860 respondants.total 31463321
## 13 1870 respondants.total 38558371
## 14 1880 respondants.total 56155783
## 15 1890 respondants.total 62622250
## 16 1900 respondants.total 75964575
# To get the hearing population for a given year (total population minus
# the deaf population), one option is to go back to rawdata: keep the year
# and the two count columns under shorter names, without the ratio column,
# so the counts sit side by side and can be compared with mutate().
censusCompare <- select(
  rawdata,
  year = Census.year,
  totalpop = Total.Population,
  deafpop = number.deaf,
  -ratio.of.population
)
censusCompare
## year totalpop deafpop
## 1 1830 12896020 6106
## 2 1840 17000453 7623
## 3 1850 23191876 9863
## 4 1860 31463321 12871
## 5 1870 38558371 16205
## 6 1880 56155783 33878
## 7 1890 62622250 40592
## 8 1900 75964575 24360
# Add a hearingpop column: total population minus deaf population, per year.
hearingPop <- mutate(censusCompare, hearingpop = totalpop - deafpop)
hearingPop
## year totalpop deafpop hearingpop
## 1 1830 12896020 6106 12889914
## 2 1840 17000453 7623 16992830
## 3 1850 23191876 9863 23182013
## 4 1860 31463321 12871 31450450
## 5 1870 38558371 16205 38542166
## 6 1880 56155783 33878 56121905
## 7 1890 62622250 40592 62581658
## 8 1900 75964575 24360 75940215
# Alternatively, start from censusTidy: spread() turns the respondant key
# back into one column per group, select() gives those columns the same
# short names used in censusCompare, and mutate() does the subtraction.
hearingPop2 <- censusTidy %>%
  spread(key = respondant, value = results) %>%
  select(
    year,
    totalpop = respondants.total,
    deafpop = respondants.deaf
  ) %>%
  mutate(hearingpop2 = totalpop - deafpop)
hearingPop2
## Source: local data frame [8 x 4]
##
## year totalpop deafpop hearingpop2
## 1 1830 12896020 6106 12889914
## 2 1840 17000453 7623 16992830
## 3 1850 23191876 9863 23182013
## 4 1860 31463321 12871 31450450
## 5 1870 38558371 16205 38542166
## 6 1880 56155783 33878 56121905
## 7 1890 62622250 40592 62581658
## 8 1900 75964575 24360 75940215
# Or add up the deaf counts across all census years. This is no indication
# of population size, since a number of respondants would potentially be
# counted in more than one census, but let's see what happens anyway.
addDeaf <- summarize(hearingPop2, totaldeaf = sum(deafpop))
addDeaf
## Source: local data frame [1 x 1]
##
## totaldeaf
## 1 151498
# Ignore the failures below!
#Got it to work, but now ratio is in the same column as respondants and
# causing problems. Moving it around.
#censusTidy <- censusTidy %>%
# tbl_df() %>%
#select(Census.year,
# respondants.total = Total.Population,
# respondants.deaf = number.deaf,
# ratio = ratio.of.population) %>%
# gather(respondant, results, ratio, -Census.year)
#censusTidy
#Nope, now respondants are split in columns
# I'd rather just remove the ratio column altogether
# Just removing the ratio options doesn't work
#censusTidy <- censusTidy %>%
# tbl_df() %>%
#select(Census.year,
# respondants.total = Total.Population,
# respondants.deaf = number.deaf,
# - ratio.of.population) %>%
# gather(respondant, results, -Census.year)
#censusTidy