Project2

Annual US Population Estimates

The dataset for this project is downloaded from https://www.census.gov/popest/data/datasets.html. This is the annual estimates of the resident population by single year of age and gender for the United States for 04/01/2010 to 07/01/2015.

The objective of this project is to analyze the population data and how it changes for each gender and age group:

The following dplyr/tidyr functions are used in this project

dplyr	tidyr
mutate	gather
filter	spread
arrange	separate

(1) Save the data as a CSV file:

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

## Loading required package: DBI

library(ggplot2)

(2) Read the information from the .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data:

**(2.a) Read 02_USPopulationEstimate.csv file**

populationEstimate_df = read.csv(file="02_USPopulationEstimate.csv", header=TRUE, sep=",")
head(populationEstimate_df, 5)

##   SEX AGE CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
## 1   0   0       3944153           3944160         3951330         3963087         3926540         3931141         3949775         3978038
## 2   0   1       3978070           3978090         3957888         3966551         3977939         3942872         3949776         3968564
## 3   0   2       4096929           4096939         4090862         3971565         3980095         3992720         3959664         3966583
## 4   0   3       4119040           4119051         4111920         4102470         3983157         3992734         4007079         3974061
## 5   0   4       4063170           4063186         4077551         4122294         4112849         3994449         4005716         4020035

print(colnames(populationEstimate_df))

##  [1] "SEX"               "AGE"               "CENSUS2010POP"     "ESTIMATESBASE2010" "POPESTIMATE2010"   "POPESTIMATE2011"   "POPESTIMATE2012"   "POPESTIMATE2013"   "POPESTIMATE2014"   "POPESTIMATE2015"

**(2.b) Apply tidyr gather function to change the data format from wide to long and change column names.**

populationEstimate2_df = gather(populationEstimate_df, TYPE, POPULATION, CENSUS2010POP:POPESTIMATE2015)
colnames(populationEstimate2_df) = c("Gender", "Age", "Type", "Population")

(2.c) Replace numbers with words to represent gender. In the original data “0” means Total, “1” means Males and “2” means Females.

populationEstimate2_df$Gender[populationEstimate2_df$Gender=="0"] = "Total"
populationEstimate2_df$Gender[populationEstimate2_df$Gender=="1"] = "Male"
populationEstimate2_df$Gender[populationEstimate2_df$Gender=="2"] = "Female"
head(populationEstimate2_df, 10)

##    Gender Age          Type Population
## 1   Total   0 CENSUS2010POP    3944153
## 2   Total   1 CENSUS2010POP    3978070
## 3   Total   2 CENSUS2010POP    4096929
## 4   Total   3 CENSUS2010POP    4119040
## 5   Total   4 CENSUS2010POP    4063170
## 6   Total   5 CENSUS2010POP    4056858
## 7   Total   6 CENSUS2010POP    4066381
## 8   Total   7 CENSUS2010POP    4030579
## 9   Total   8 CENSUS2010POP    4046486
## 10  Total   9 CENSUS2010POP    4148353

**(2.d) Perform data manipulation to extract appropriate data contents. Then apply tidyr separate function to split the date column from Type column.**

populationEstimate2_df$Type = gsub("CENSUS", "CENSUS_", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("POP", "", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("ESTIMATESBASE", "BASEESTIMATE", populationEstimate2_df$Type)
populationEstimate2_df$Type = gsub("ESTIMATE", "ESTIMATE_", populationEstimate2_df$Type)
populationEstimate2_df = separate(populationEstimate2_df, Type, c("Type", "Year"), sep = "_")
populationEstimate2_df$Type = gsub("BASEESTIMATE", "BASE_ESTIMATE", populationEstimate2_df$Type)
head(populationEstimate2_df, 10)

##    Gender Age   Type Year Population
## 1   Total   0 CENSUS 2010    3944153
## 2   Total   1 CENSUS 2010    3978070
## 3   Total   2 CENSUS 2010    4096929
## 4   Total   3 CENSUS 2010    4119040
## 5   Total   4 CENSUS 2010    4063170
## 6   Total   5 CENSUS 2010    4056858
## 7   Total   6 CENSUS 2010    4066381
## 8   Total   7 CENSUS 2010    4030579
## 9   Total   8 CENSUS 2010    4046486
## 10  Total   9 CENSUS 2010    4148353

**(2.e) Apply dplyr arrange function and to sort the data on Gender**

populationEstimate2_df = arrange(populationEstimate2_df, Gender)
head(populationEstimate2_df, 10)

##    Gender Age   Type Year Population
## 1  Female   0 CENSUS 2010    1929877
## 2  Female   1 CENSUS 2010    1947217
## 3  Female   2 CENSUS 2010    2004731
## 4  Female   3 CENSUS 2010    2014490
## 5  Female   4 CENSUS 2010    1985620
## 6  Female   5 CENSUS 2010    1984764
## 7  Female   6 CENSUS 2010    1991062
## 8  Female   7 CENSUS 2010    1973503
## 9  Female   8 CENSUS 2010    1981033
## 10 Female   9 CENSUS 2010    2028657

**(2.f) Apply tidyr spread function to spread gender in 2 different columns, namely Male and Female.**

populationEstimate3_df = spread(populationEstimate2_df, Gender, Population)
head(populationEstimate3_df, 10)

##    Age          Type Year  Female    Male   Total
## 1    0 BASE_ESTIMATE 2010 1929882 2014278 3944160
## 2    0        CENSUS 2010 1929877 2014276 3944153
## 3    0      ESTIMATE 2010 1932910 2018420 3951330
## 4    0      ESTIMATE 2011 1934660 2028427 3963087
## 5    0      ESTIMATE 2012 1918823 2007717 3926540
## 6    0      ESTIMATE 2013 1921613 2009528 3931141
## 7    0      ESTIMATE 2014 1929449 2020326 3949775
## 8    0      ESTIMATE 2015 1942904 2035134 3978038
## 9    1 BASE_ESTIMATE 2010 1947229 2030861 3978090
## 10   1        CENSUS 2010 1947217 2030853 3978070

(2.g) Create a function to calculate the percentage and round it to 2 decimal digits

calcPercentage = function(value, totalValue) {
  round((value/totalValue)*100, 2)
}

**(2.h) Apply tidyr mutate function to add 2 different columns, namely PercentMale and PercentFemale.**

populationEstimate3_df = mutate(populationEstimate3_df, PercentFemale=calcPercentage(Female, Total), PercentMale=calcPercentage(Male, Total))
head(populationEstimate3_df, 10)

##    Age          Type Year  Female    Male   Total PercentFemale PercentMale
## 1    0 BASE_ESTIMATE 2010 1929882 2014278 3944160         48.93       51.07
## 2    0        CENSUS 2010 1929877 2014276 3944153         48.93       51.07
## 3    0      ESTIMATE 2010 1932910 2018420 3951330         48.92       51.08
## 4    0      ESTIMATE 2011 1934660 2028427 3963087         48.82       51.18
## 5    0      ESTIMATE 2012 1918823 2007717 3926540         48.87       51.13
## 6    0      ESTIMATE 2013 1921613 2009528 3931141         48.88       51.12
## 7    0      ESTIMATE 2014 1929449 2020326 3949775         48.85       51.15
## 8    0      ESTIMATE 2015 1942904 2035134 3978038         48.84       51.16
## 9    1 BASE_ESTIMATE 2010 1947229 2030861 3978090         48.95       51.05
## 10   1        CENSUS 2010 1947217 2030853 3978070         48.95       51.05

(3) Prepare data for analysis:

**(3.a) Apply dplyr filter function to get Census data and display Age vs % of males**

graph_df2 = filter (populationEstimate3_df, (Type == "CENSUS" & Age != 999))
head(graph_df2, 10)

##    Age   Type Year  Female    Male   Total PercentFemale PercentMale
## 1    0 CENSUS 2010 1929877 2014276 3944153         48.93       51.07
## 2    1 CENSUS 2010 1947217 2030853 3978070         48.95       51.05
## 3    2 CENSUS 2010 2004731 2092198 4096929         48.93       51.07
## 4    3 CENSUS 2010 2014490 2104550 4119040         48.91       51.09
## 5    4 CENSUS 2010 1985620 2077550 4063170         48.87       51.13
## 6    5 CENSUS 2010 1984764 2072094 4056858         48.92       51.08
## 7    6 CENSUS 2010 1991062 2075319 4066381         48.96       51.04
## 8    7 CENSUS 2010 1973503 2057076 4030579         48.96       51.04
## 9    8 CENSUS 2010 1981033 2065453 4046486         48.96       51.04
## 10   9 CENSUS 2010 2028657 2119696 4148353         48.90       51.10

ggplot(data=graph_df2, aes(x=Age, y=PercentMale, color=PercentMale)) +  geom_line() + 
  geom_point() + ylab("PercentMale") +  ggtitle("Age Vs % Males")

**(3.b) Apply dplyr filter function to get Census data for all genders and display Age vs males and female population**

# Question: Draw Males and Females in a chat and see the difference
graph_df0 = filter (populationEstimate2_df, (Type == "CENSUS" & Age != 999 & Gender != "Total"))
head(graph_df0, 10)

##    Gender Age   Type Year Population
## 1  Female   0 CENSUS 2010    1929877
## 2  Female   1 CENSUS 2010    1947217
## 3  Female   2 CENSUS 2010    2004731
## 4  Female   3 CENSUS 2010    2014490
## 5  Female   4 CENSUS 2010    1985620
## 6  Female   5 CENSUS 2010    1984764
## 7  Female   6 CENSUS 2010    1991062
## 8  Female   7 CENSUS 2010    1973503
## 9  Female   8 CENSUS 2010    1981033
## 10 Female   9 CENSUS 2010    2028657

ggplot(data=graph_df0, aes(x=Age, y=Population, group=Gender, color=Gender)) +  geom_line() + 
  geom_point() + ylab("Population") +  ggtitle("Age Vs Population")

(4) Conclusion:

(4.a) From the graphs we can conclude that the male population started to decline rapidly at age 75. And the population size of males is increased from age 0 to approximately age 27. Then the population of males and females is almost equal. At around age 50 the population of females increased and then it stayed more than the male population until age 100.

Project2_02

A.S

October 7, 2016