dplyr | tidyr |
---|---|
mutate | gather |
filter | spread |
arrange | separate |
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ggplot2)
populationEstimate_df = read.csv(file="02_USPopulationEstimate.csv", header=TRUE, sep=",")
head(populationEstimate_df, 5)
## SEX AGE CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
## 1 0 0 3944153 3944160 3951330 3963087 3926540 3931141 3949775 3978038
## 2 0 1 3978070 3978090 3957888 3966551 3977939 3942872 3949776 3968564
## 3 0 2 4096929 4096939 4090862 3971565 3980095 3992720 3959664 3966583
## 4 0 3 4119040 4119051 4111920 4102470 3983157 3992734 4007079 3974061
## 5 0 4 4063170 4063186 4077551 4122294 4112849 3994449 4005716 4020035
print(colnames(populationEstimate_df))
## [1] "SEX" "AGE" "CENSUS2010POP" "ESTIMATESBASE2010" "POPESTIMATE2010" "POPESTIMATE2011" "POPESTIMATE2012" "POPESTIMATE2013" "POPESTIMATE2014" "POPESTIMATE2015"
populationEstimate2_df = gather(populationEstimate_df, TYPE, POPULATION, CENSUS2010POP:POPESTIMATE2015)
colnames(populationEstimate2_df) = c("Gender", "Age", "Type", "Population")
populationEstimate2_df$Gender[populationEstimate2_df$Gender=="0"] = "Total"
populationEstimate2_df$Gender[populationEstimate2_df$Gender=="1"] = "Male"
populationEstimate2_df$Gender[populationEstimate2_df$Gender=="2"] = "Female"
head(populationEstimate2_df, 10)
## Gender Age Type Population
## 1 Total 0 CENSUS2010POP 3944153
## 2 Total 1 CENSUS2010POP 3978070
## 3 Total 2 CENSUS2010POP 4096929
## 4 Total 3 CENSUS2010POP 4119040
## 5 Total 4 CENSUS2010POP 4063170
## 6 Total 5 CENSUS2010POP 4056858
## 7 Total 6 CENSUS2010POP 4066381
## 8 Total 7 CENSUS2010POP 4030579
## 9 Total 8 CENSUS2010POP 4046486
## 10 Total 9 CENSUS2010POP 4148353
populationEstimate2_df$Type = gsub("CENSUS", "CENSUS_", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("POP", "", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("ESTIMATESBASE", "BASEESTIMATE", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("ESTIMATE", "ESTIMATE_", populationEstimate2_df$Type)
populationEstimate2_df = separate(populationEstimate2_df, Type, c("Type", "Year"), sep = "_")
populationEstimate2_df$Type = gsub("BASEESTIMATE", "BASE_ESTIMATE", populationEstimate2_df$Type)
head(populationEstimate2_df, 10)
## Gender Age Type Year Population
## 1 Total 0 CENSUS 2010 3944153
## 2 Total 1 CENSUS 2010 3978070
## 3 Total 2 CENSUS 2010 4096929
## 4 Total 3 CENSUS 2010 4119040
## 5 Total 4 CENSUS 2010 4063170
## 6 Total 5 CENSUS 2010 4056858
## 7 Total 6 CENSUS 2010 4066381
## 8 Total 7 CENSUS 2010 4030579
## 9 Total 8 CENSUS 2010 4046486
## 10 Total 9 CENSUS 2010 4148353
populationEstimate2_df = arrange(populationEstimate2_df, Gender)
head(populationEstimate2_df, 10)
## Gender Age Type Year Population
## 1 Female 0 CENSUS 2010 1929877
## 2 Female 1 CENSUS 2010 1947217
## 3 Female 2 CENSUS 2010 2004731
## 4 Female 3 CENSUS 2010 2014490
## 5 Female 4 CENSUS 2010 1985620
## 6 Female 5 CENSUS 2010 1984764
## 7 Female 6 CENSUS 2010 1991062
## 8 Female 7 CENSUS 2010 1973503
## 9 Female 8 CENSUS 2010 1981033
## 10 Female 9 CENSUS 2010 2028657
populationEstimate3_df = spread(populationEstimate2_df, Gender, Population)
head(populationEstimate3_df, 10)
## Age Type Year Female Male Total
## 1 0 BASE_ESTIMATE 2010 1929882 2014278 3944160
## 2 0 CENSUS 2010 1929877 2014276 3944153
## 3 0 ESTIMATE 2010 1932910 2018420 3951330
## 4 0 ESTIMATE 2011 1934660 2028427 3963087
## 5 0 ESTIMATE 2012 1918823 2007717 3926540
## 6 0 ESTIMATE 2013 1921613 2009528 3931141
## 7 0 ESTIMATE 2014 1929449 2020326 3949775
## 8 0 ESTIMATE 2015 1942904 2035134 3978038
## 9 1 BASE_ESTIMATE 2010 1947229 2030861 3978090
## 10 1 CENSUS 2010 1947217 2030853 3978070
calcPercentage = function(value, totalValue) {
round((value/totalValue)*100, 2)
}
populationEstimate3_df = mutate(populationEstimate3_df, PercentFemale=calcPercentage(Female, Total), PercentMale=calcPercentage(Male, Total))
head(populationEstimate3_df, 10)
## Age Type Year Female Male Total PercentFemale PercentMale
## 1 0 BASE_ESTIMATE 2010 1929882 2014278 3944160 48.93 51.07
## 2 0 CENSUS 2010 1929877 2014276 3944153 48.93 51.07
## 3 0 ESTIMATE 2010 1932910 2018420 3951330 48.92 51.08
## 4 0 ESTIMATE 2011 1934660 2028427 3963087 48.82 51.18
## 5 0 ESTIMATE 2012 1918823 2007717 3926540 48.87 51.13
## 6 0 ESTIMATE 2013 1921613 2009528 3931141 48.88 51.12
## 7 0 ESTIMATE 2014 1929449 2020326 3949775 48.85 51.15
## 8 0 ESTIMATE 2015 1942904 2035134 3978038 48.84 51.16
## 9 1 BASE_ESTIMATE 2010 1947229 2030861 3978090 48.95 51.05
## 10 1 CENSUS 2010 1947217 2030853 3978070 48.95 51.05
graph_df2 = filter (populationEstimate3_df, (Type == "CENSUS" & Age != 999))
head(graph_df2, 10)
## Age Type Year Female Male Total PercentFemale PercentMale
## 1 0 CENSUS 2010 1929877 2014276 3944153 48.93 51.07
## 2 1 CENSUS 2010 1947217 2030853 3978070 48.95 51.05
## 3 2 CENSUS 2010 2004731 2092198 4096929 48.93 51.07
## 4 3 CENSUS 2010 2014490 2104550 4119040 48.91 51.09
## 5 4 CENSUS 2010 1985620 2077550 4063170 48.87 51.13
## 6 5 CENSUS 2010 1984764 2072094 4056858 48.92 51.08
## 7 6 CENSUS 2010 1991062 2075319 4066381 48.96 51.04
## 8 7 CENSUS 2010 1973503 2057076 4030579 48.96 51.04
## 9 8 CENSUS 2010 1981033 2065453 4046486 48.96 51.04
## 10 9 CENSUS 2010 2028657 2119696 4148353 48.90 51.10
ggplot(data=graph_df2, aes(x=Age, y=PercentMale, color=PercentMale)) + geom_line() +
geom_point() + ylab("PercentMale") + ggtitle("Age Vs % Males")
# Question: Draw Males and Females in a chat and see the difference
graph_df0 = filter (populationEstimate2_df, (Type == "CENSUS" & Age != 999 & Gender != "Total"))
head(graph_df0, 10)
## Gender Age Type Year Population
## 1 Female 0 CENSUS 2010 1929877
## 2 Female 1 CENSUS 2010 1947217
## 3 Female 2 CENSUS 2010 2004731
## 4 Female 3 CENSUS 2010 2014490
## 5 Female 4 CENSUS 2010 1985620
## 6 Female 5 CENSUS 2010 1984764
## 7 Female 6 CENSUS 2010 1991062
## 8 Female 7 CENSUS 2010 1973503
## 9 Female 8 CENSUS 2010 1981033
## 10 Female 9 CENSUS 2010 2028657
ggplot(data=graph_df0, aes(x=Age, y=Population, group=Gender, color=Gender)) + geom_line() +
geom_point() + ylab("Population") + ggtitle("Age Vs Population")
(4.a) From the graphs we can conclude that the male population started to decline rapidly at age 75. And the population size of males is increased from age 0 to approximately age 27. Then the population of males and females is almost equal. At around age 50 the population of females increased and then it stayed more than the male population until age 100.