Assignment 607_Project_2_Wide_to_Long_Transform_Birth Data_Daniel_Thonn

Summary of Assignment This assignment involves Tidying and Transforming data

This Assignment requires the following:

1). R-Studio

The following R-packages are used: 1.tidyr 2.dplyr 3.ggplot2

Steps to reproduce:

1). Place the file locally: C:/mysqldata/Wide_Birth_Data_2.csv

2). run the R-Studio file: R_607_Project_2_Wide_to_Long_Daniel_Thonn_3a.Rmd

Setting up and Preparing the Environment

#install.packages("stringr")
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("plyr")
#library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library(plyr)
#detach("package:plyr", unload=TRUE)
library(ggplot2)

Load the character names data into an R dataframe, cleanup, and convert to long format

# Load the data csv file to a vector
d <- read.csv(
              "C:/mysqldata/Wide_Birth_Data_2.csv",
              sep=",",
              na.strings = "",
              blank.lines.skip = TRUE,
                stringsAsFactors=FALSE)
list(d)
## [[1]]
##   caseid v012 b2_01 b2_02 b2_03 b4_01 b4_02 b4_03
## 1      1   30  2000  2005    NA     1     1    NA
## 2      2   29  2001  2010    NA     1     2    NA
## 3      3   32  1999  2002  2006     1     1     1
## 4      4   35  1999  2009    NA     2     1    NA
## 5      5   34  1998    NA    NA     2    NA    NA
## 6      6   23    NA    NA    NA    NA    NA    NA
## 7      7   25  2000    NA    NA     1    NA    NA
# Convert the input vector to a dataframe
df_Birth_Data_1 = data.frame(d)
df_Birth_Data_1
##   caseid v012 b2_01 b2_02 b2_03 b4_01 b4_02 b4_03
## 1      1   30  2000  2005    NA     1     1    NA
## 2      2   29  2001  2010    NA     1     2    NA
## 3      3   32  1999  2002  2006     1     1     1
## 4      4   35  1999  2009    NA     2     1    NA
## 5      5   34  1998    NA    NA     2    NA    NA
## 6      6   23    NA    NA    NA    NA    NA    NA
## 7      7   25  2000    NA    NA     1    NA    NA
df_Birth_Data_1b <- df_Birth_Data_1
df_Birth_Data_1b
##   caseid v012 b2_01 b2_02 b2_03 b4_01 b4_02 b4_03
## 1      1   30  2000  2005    NA     1     1    NA
## 2      2   29  2001  2010    NA     1     2    NA
## 3      3   32  1999  2002  2006     1     1     1
## 4      4   35  1999  2009    NA     2     1    NA
## 5      5   34  1998    NA    NA     2    NA    NA
## 6      6   23    NA    NA    NA    NA    NA    NA
## 7      7   25  2000    NA    NA     1    NA    NA
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "v012"] <- "MothersAge"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b2_01"] <- "BirthYear1"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b2_02"] <- "Birthyear2"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b2_03"] <- "Birthyear3"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b4_01"] <- "Sexbirth1"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b4_02"] <- "Sexbirth2"
names(df_Birth_Data_1b)[names(df_Birth_Data_1b) == "b4_03"] <- "Sexbirth3"


df_Birth_Data_1b
##   caseid MothersAge BirthYear1 Birthyear2 Birthyear3 Sexbirth1 Sexbirth2
## 1      1         30       2000       2005         NA         1         1
## 2      2         29       2001       2010         NA         1         2
## 3      3         32       1999       2002       2006         1         1
## 4      4         35       1999       2009         NA         2         1
## 5      5         34       1998         NA         NA         2        NA
## 6      6         23         NA         NA         NA        NA        NA
## 7      7         25       2000         NA         NA         1        NA
##   Sexbirth3
## 1        NA
## 2        NA
## 3         1
## 4        NA
## 5        NA
## 6        NA
## 7        NA
names(df_Birth_Data_1b)
## [1] "caseid"     "MothersAge" "BirthYear1" "Birthyear2" "Birthyear3"
## [6] "Sexbirth1"  "Sexbirth2"  "Sexbirth3"
# Convert wide dataframe into long dataframe - Birthyear
df_Birth_Data_1b_long <- gather(df_Birth_Data_1b,BirthyearCategory,Birthyear,BirthYear1,Birthyear2,Birthyear3) 
# %>% arrange(MothersAge)
df_Birth_Data_1b_long
##    caseid MothersAge Sexbirth1 Sexbirth2 Sexbirth3 BirthyearCategory
## 1       1         30         1         1        NA        BirthYear1
## 2       2         29         1         2        NA        BirthYear1
## 3       3         32         1         1         1        BirthYear1
## 4       4         35         2         1        NA        BirthYear1
## 5       5         34         2        NA        NA        BirthYear1
## 6       6         23        NA        NA        NA        BirthYear1
## 7       7         25         1        NA        NA        BirthYear1
## 8       1         30         1         1        NA        Birthyear2
## 9       2         29         1         2        NA        Birthyear2
## 10      3         32         1         1         1        Birthyear2
## 11      4         35         2         1        NA        Birthyear2
## 12      5         34         2        NA        NA        Birthyear2
## 13      6         23        NA        NA        NA        Birthyear2
## 14      7         25         1        NA        NA        Birthyear2
## 15      1         30         1         1        NA        Birthyear3
## 16      2         29         1         2        NA        Birthyear3
## 17      3         32         1         1         1        Birthyear3
## 18      4         35         2         1        NA        Birthyear3
## 19      5         34         2        NA        NA        Birthyear3
## 20      6         23        NA        NA        NA        Birthyear3
## 21      7         25         1        NA        NA        Birthyear3
##    Birthyear
## 1       2000
## 2       2001
## 3       1999
## 4       1999
## 5       1998
## 6         NA
## 7       2000
## 8       2005
## 9       2010
## 10      2002
## 11      2009
## 12        NA
## 13        NA
## 14        NA
## 15        NA
## 16        NA
## 17      2006
## 18        NA
## 19        NA
## 20        NA
## 21        NA
# Convert wide dataframe into long dataframe - SexBirth
df_Birth_Data_1c_long <- gather(df_Birth_Data_1b_long,SexBirthCategory,Gender,Sexbirth1,Sexbirth2,Sexbirth3) 
# %>% arrange(MothersAge)
df_Birth_Data_1c_long
##    caseid MothersAge BirthyearCategory Birthyear SexBirthCategory Gender
## 1       1         30        BirthYear1      2000        Sexbirth1      1
## 2       2         29        BirthYear1      2001        Sexbirth1      1
## 3       3         32        BirthYear1      1999        Sexbirth1      1
## 4       4         35        BirthYear1      1999        Sexbirth1      2
## 5       5         34        BirthYear1      1998        Sexbirth1      2
## 6       6         23        BirthYear1        NA        Sexbirth1     NA
## 7       7         25        BirthYear1      2000        Sexbirth1      1
## 8       1         30        Birthyear2      2005        Sexbirth1      1
## 9       2         29        Birthyear2      2010        Sexbirth1      1
## 10      3         32        Birthyear2      2002        Sexbirth1      1
## 11      4         35        Birthyear2      2009        Sexbirth1      2
## 12      5         34        Birthyear2        NA        Sexbirth1      2
## 13      6         23        Birthyear2        NA        Sexbirth1     NA
## 14      7         25        Birthyear2        NA        Sexbirth1      1
## 15      1         30        Birthyear3        NA        Sexbirth1      1
## 16      2         29        Birthyear3        NA        Sexbirth1      1
## 17      3         32        Birthyear3      2006        Sexbirth1      1
## 18      4         35        Birthyear3        NA        Sexbirth1      2
## 19      5         34        Birthyear3        NA        Sexbirth1      2
## 20      6         23        Birthyear3        NA        Sexbirth1     NA
## 21      7         25        Birthyear3        NA        Sexbirth1      1
## 22      1         30        BirthYear1      2000        Sexbirth2      1
## 23      2         29        BirthYear1      2001        Sexbirth2      2
## 24      3         32        BirthYear1      1999        Sexbirth2      1
## 25      4         35        BirthYear1      1999        Sexbirth2      1
## 26      5         34        BirthYear1      1998        Sexbirth2     NA
## 27      6         23        BirthYear1        NA        Sexbirth2     NA
## 28      7         25        BirthYear1      2000        Sexbirth2     NA
## 29      1         30        Birthyear2      2005        Sexbirth2      1
## 30      2         29        Birthyear2      2010        Sexbirth2      2
## 31      3         32        Birthyear2      2002        Sexbirth2      1
## 32      4         35        Birthyear2      2009        Sexbirth2      1
## 33      5         34        Birthyear2        NA        Sexbirth2     NA
## 34      6         23        Birthyear2        NA        Sexbirth2     NA
## 35      7         25        Birthyear2        NA        Sexbirth2     NA
## 36      1         30        Birthyear3        NA        Sexbirth2      1
## 37      2         29        Birthyear3        NA        Sexbirth2      2
## 38      3         32        Birthyear3      2006        Sexbirth2      1
## 39      4         35        Birthyear3        NA        Sexbirth2      1
## 40      5         34        Birthyear3        NA        Sexbirth2     NA
## 41      6         23        Birthyear3        NA        Sexbirth2     NA
## 42      7         25        Birthyear3        NA        Sexbirth2     NA
## 43      1         30        BirthYear1      2000        Sexbirth3     NA
## 44      2         29        BirthYear1      2001        Sexbirth3     NA
## 45      3         32        BirthYear1      1999        Sexbirth3      1
## 46      4         35        BirthYear1      1999        Sexbirth3     NA
## 47      5         34        BirthYear1      1998        Sexbirth3     NA
## 48      6         23        BirthYear1        NA        Sexbirth3     NA
## 49      7         25        BirthYear1      2000        Sexbirth3     NA
## 50      1         30        Birthyear2      2005        Sexbirth3     NA
## 51      2         29        Birthyear2      2010        Sexbirth3     NA
## 52      3         32        Birthyear2      2002        Sexbirth3      1
## 53      4         35        Birthyear2      2009        Sexbirth3     NA
## 54      5         34        Birthyear2        NA        Sexbirth3     NA
## 55      6         23        Birthyear2        NA        Sexbirth3     NA
## 56      7         25        Birthyear2        NA        Sexbirth3     NA
## 57      1         30        Birthyear3        NA        Sexbirth3     NA
## 58      2         29        Birthyear3        NA        Sexbirth3     NA
## 59      3         32        Birthyear3      2006        Sexbirth3      1
## 60      4         35        Birthyear3        NA        Sexbirth3     NA
## 61      5         34        Birthyear3        NA        Sexbirth3     NA
## 62      6         23        Birthyear3        NA        Sexbirth3     NA
## 63      7         25        Birthyear3        NA        Sexbirth3     NA
df_Birth_Data_1d_long <- df_Birth_Data_1c_long
df_Birth_Data_1d_long
##    caseid MothersAge BirthyearCategory Birthyear SexBirthCategory Gender
## 1       1         30        BirthYear1      2000        Sexbirth1      1
## 2       2         29        BirthYear1      2001        Sexbirth1      1
## 3       3         32        BirthYear1      1999        Sexbirth1      1
## 4       4         35        BirthYear1      1999        Sexbirth1      2
## 5       5         34        BirthYear1      1998        Sexbirth1      2
## 6       6         23        BirthYear1        NA        Sexbirth1     NA
## 7       7         25        BirthYear1      2000        Sexbirth1      1
## 8       1         30        Birthyear2      2005        Sexbirth1      1
## 9       2         29        Birthyear2      2010        Sexbirth1      1
## 10      3         32        Birthyear2      2002        Sexbirth1      1
## 11      4         35        Birthyear2      2009        Sexbirth1      2
## 12      5         34        Birthyear2        NA        Sexbirth1      2
## 13      6         23        Birthyear2        NA        Sexbirth1     NA
## 14      7         25        Birthyear2        NA        Sexbirth1      1
## 15      1         30        Birthyear3        NA        Sexbirth1      1
## 16      2         29        Birthyear3        NA        Sexbirth1      1
## 17      3         32        Birthyear3      2006        Sexbirth1      1
## 18      4         35        Birthyear3        NA        Sexbirth1      2
## 19      5         34        Birthyear3        NA        Sexbirth1      2
## 20      6         23        Birthyear3        NA        Sexbirth1     NA
## 21      7         25        Birthyear3        NA        Sexbirth1      1
## 22      1         30        BirthYear1      2000        Sexbirth2      1
## 23      2         29        BirthYear1      2001        Sexbirth2      2
## 24      3         32        BirthYear1      1999        Sexbirth2      1
## 25      4         35        BirthYear1      1999        Sexbirth2      1
## 26      5         34        BirthYear1      1998        Sexbirth2     NA
## 27      6         23        BirthYear1        NA        Sexbirth2     NA
## 28      7         25        BirthYear1      2000        Sexbirth2     NA
## 29      1         30        Birthyear2      2005        Sexbirth2      1
## 30      2         29        Birthyear2      2010        Sexbirth2      2
## 31      3         32        Birthyear2      2002        Sexbirth2      1
## 32      4         35        Birthyear2      2009        Sexbirth2      1
## 33      5         34        Birthyear2        NA        Sexbirth2     NA
## 34      6         23        Birthyear2        NA        Sexbirth2     NA
## 35      7         25        Birthyear2        NA        Sexbirth2     NA
## 36      1         30        Birthyear3        NA        Sexbirth2      1
## 37      2         29        Birthyear3        NA        Sexbirth2      2
## 38      3         32        Birthyear3      2006        Sexbirth2      1
## 39      4         35        Birthyear3        NA        Sexbirth2      1
## 40      5         34        Birthyear3        NA        Sexbirth2     NA
## 41      6         23        Birthyear3        NA        Sexbirth2     NA
## 42      7         25        Birthyear3        NA        Sexbirth2     NA
## 43      1         30        BirthYear1      2000        Sexbirth3     NA
## 44      2         29        BirthYear1      2001        Sexbirth3     NA
## 45      3         32        BirthYear1      1999        Sexbirth3      1
## 46      4         35        BirthYear1      1999        Sexbirth3     NA
## 47      5         34        BirthYear1      1998        Sexbirth3     NA
## 48      6         23        BirthYear1        NA        Sexbirth3     NA
## 49      7         25        BirthYear1      2000        Sexbirth3     NA
## 50      1         30        Birthyear2      2005        Sexbirth3     NA
## 51      2         29        Birthyear2      2010        Sexbirth3     NA
## 52      3         32        Birthyear2      2002        Sexbirth3      1
## 53      4         35        Birthyear2      2009        Sexbirth3     NA
## 54      5         34        Birthyear2        NA        Sexbirth3     NA
## 55      6         23        Birthyear2        NA        Sexbirth3     NA
## 56      7         25        Birthyear2        NA        Sexbirth3     NA
## 57      1         30        Birthyear3        NA        Sexbirth3     NA
## 58      2         29        Birthyear3        NA        Sexbirth3     NA
## 59      3         32        Birthyear3      2006        Sexbirth3      1
## 60      4         35        Birthyear3        NA        Sexbirth3     NA
## 61      5         34        Birthyear3        NA        Sexbirth3     NA
## 62      6         23        Birthyear3        NA        Sexbirth3     NA
## 63      7         25        Birthyear3        NA        Sexbirth3     NA
# Cleanup dataframe with appropriate values
df_Birth_Data_1d_long$Gender <- gsub("1", "M", df_Birth_Data_1d_long$Gender)
df_Birth_Data_1d_long$Gender <- gsub("2", "F", df_Birth_Data_1d_long$Gender)
df_Birth_Data_1d_long$Gender <- gsub("NA", "", df_Birth_Data_1d_long$Gender)
df_Birth_Data_1d_long$Birthyear <- gsub("NA", "", df_Birth_Data_1d_long$Birthyear)

df_Birth_Data_1d_long
##    caseid MothersAge BirthyearCategory Birthyear SexBirthCategory Gender
## 1       1         30        BirthYear1      2000        Sexbirth1      M
## 2       2         29        BirthYear1      2001        Sexbirth1      M
## 3       3         32        BirthYear1      1999        Sexbirth1      M
## 4       4         35        BirthYear1      1999        Sexbirth1      F
## 5       5         34        BirthYear1      1998        Sexbirth1      F
## 6       6         23        BirthYear1                  Sexbirth1       
## 7       7         25        BirthYear1      2000        Sexbirth1      M
## 8       1         30        Birthyear2      2005        Sexbirth1      M
## 9       2         29        Birthyear2      2010        Sexbirth1      M
## 10      3         32        Birthyear2      2002        Sexbirth1      M
## 11      4         35        Birthyear2      2009        Sexbirth1      F
## 12      5         34        Birthyear2                  Sexbirth1      F
## 13      6         23        Birthyear2                  Sexbirth1       
## 14      7         25        Birthyear2                  Sexbirth1      M
## 15      1         30        Birthyear3                  Sexbirth1      M
## 16      2         29        Birthyear3                  Sexbirth1      M
## 17      3         32        Birthyear3      2006        Sexbirth1      M
## 18      4         35        Birthyear3                  Sexbirth1      F
## 19      5         34        Birthyear3                  Sexbirth1      F
## 20      6         23        Birthyear3                  Sexbirth1       
## 21      7         25        Birthyear3                  Sexbirth1      M
## 22      1         30        BirthYear1      2000        Sexbirth2      M
## 23      2         29        BirthYear1      2001        Sexbirth2      F
## 24      3         32        BirthYear1      1999        Sexbirth2      M
## 25      4         35        BirthYear1      1999        Sexbirth2      M
## 26      5         34        BirthYear1      1998        Sexbirth2       
## 27      6         23        BirthYear1                  Sexbirth2       
## 28      7         25        BirthYear1      2000        Sexbirth2       
## 29      1         30        Birthyear2      2005        Sexbirth2      M
## 30      2         29        Birthyear2      2010        Sexbirth2      F
## 31      3         32        Birthyear2      2002        Sexbirth2      M
## 32      4         35        Birthyear2      2009        Sexbirth2      M
## 33      5         34        Birthyear2                  Sexbirth2       
## 34      6         23        Birthyear2                  Sexbirth2       
## 35      7         25        Birthyear2                  Sexbirth2       
## 36      1         30        Birthyear3                  Sexbirth2      M
## 37      2         29        Birthyear3                  Sexbirth2      F
## 38      3         32        Birthyear3      2006        Sexbirth2      M
## 39      4         35        Birthyear3                  Sexbirth2      M
## 40      5         34        Birthyear3                  Sexbirth2       
## 41      6         23        Birthyear3                  Sexbirth2       
## 42      7         25        Birthyear3                  Sexbirth2       
## 43      1         30        BirthYear1      2000        Sexbirth3       
## 44      2         29        BirthYear1      2001        Sexbirth3       
## 45      3         32        BirthYear1      1999        Sexbirth3      M
## 46      4         35        BirthYear1      1999        Sexbirth3       
## 47      5         34        BirthYear1      1998        Sexbirth3       
## 48      6         23        BirthYear1                  Sexbirth3       
## 49      7         25        BirthYear1      2000        Sexbirth3       
## 50      1         30        Birthyear2      2005        Sexbirth3       
## 51      2         29        Birthyear2      2010        Sexbirth3       
## 52      3         32        Birthyear2      2002        Sexbirth3      M
## 53      4         35        Birthyear2      2009        Sexbirth3       
## 54      5         34        Birthyear2                  Sexbirth3       
## 55      6         23        Birthyear2                  Sexbirth3       
## 56      7         25        Birthyear2                  Sexbirth3       
## 57      1         30        Birthyear3                  Sexbirth3       
## 58      2         29        Birthyear3                  Sexbirth3       
## 59      3         32        Birthyear3      2006        Sexbirth3      M
## 60      4         35        Birthyear3                  Sexbirth3       
## 61      5         34        Birthyear3                  Sexbirth3       
## 62      6         23        Birthyear3                  Sexbirth3       
## 63      7         25        Birthyear3                  Sexbirth3
# Plot-1 Range of values per Mothers Age and BirthyearCategory
qplot(x=MothersAge,y=BirthyearCategory, data=df_Birth_Data_1d_long, geom="line",colour = Gender)

# Plot-2 range of MothersAge values per BirthyearCategory
qplot(x=MothersAge,y=Gender, data=df_Birth_Data_1d_long, geom="line") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) 

# Plot-3 range of Percent_Income values per Religious.tradition by IncomeRange
ggplot(df_Birth_Data_1d_long, aes(BirthyearCategory,Gender, group = MothersAge, colour = MothersAge)) + geom_path(alpha = 0.5) + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))

Conclusion: In Plot-2 it can be seen that more of the younger mothers had a Male child. In Plot-3 it can be seen that there are instances of a first child male and second child female, a first child female and second child male, and first child male and second child male, as well as first child female and second child female.

END