Summary of Assignment: This assignment involves tidying and transforming income-distribution-by-religion data.
This Assignment requires the following:
1). R-Studio
The following R-packages are used: 1.tidyr 2.dplyr 3.ggplot2
Steps to reproduce: 1). Place file locally: C:/mysqldata/Income_Distribution_by_Religion.csv
2). run the R-Studio file: R_607_Project_2_Wide_to_Long_Daniel_Thonn_2a.Rmd
Setting up and Preparing the Environment
#install.packages("stringr")
#install.packages("tidyr")
#install.packages("dplyr")
#install.packages("plyr")
#library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(plyr)
#detach("package:plyr", unload=TRUE)
library(ggplot2)
Load the income-distribution-by-religion data into an R data frame, clean it up, and convert it to long format
# Read the income-distribution CSV. read.csv() returns a data frame (not a
# vector). Empty fields become NA and text columns stay character.
# NOTE: the variable is named `c`; calls such as c("a", "b") still resolve to
# the base function because R looks up functions separately from data objects.
c <- read.csv(
  file = "C:/mysqldata/Income_Distribution_by_Religion.csv",
  sep = ",",
  na.strings = "",
  blank.lines.skip = TRUE,
  stringsAsFactors = FALSE
)
# Wrapping in list() prints the data frame as the single element of a list.
list(c)
## [[1]]
## Religious.tradition Less.than..30.000 X.30.000....49.999
## 1 Buddhist 36 18
## 2 Catholic 36 19
## 3 Evangelical Protestant 35 22
## 4 Hindu 17 13
## 5 Historically Black Protestant 53 22
## 6 Jehovah's Witness 48 25
## 7 Jewish 16 15
## 8 Mainline Protestant 29 20
## 9 Mormon 27 20
## 10 Muslim 34 17
## 11 Orthodox Christian 18 17
## 12 Unaffiliated 33 20
## X.50.000....99.000 X.100.000.or.more Sample.Size
## 1 32 13 233
## 2 26 19 6137
## 3 28 14 7462
## 4 34 36 172
## 5 17 8 1704
## 6 22 4 208
## 7 24 44 708
## 8 28 23 5208
## 9 33 20 594
## 10 29 20 205
## 11 36 29 155
## 12 26 21 6790
# Copy the loaded data into a descriptively named data frame.
# (The object returned by read.csv() is already a data frame.)
df_Income_Religion_1 <- data.frame(c)
df_Income_Religion_1
## Religious.tradition Less.than..30.000 X.30.000....49.999
## 1 Buddhist 36 18
## 2 Catholic 36 19
## 3 Evangelical Protestant 35 22
## 4 Hindu 17 13
## 5 Historically Black Protestant 53 22
## 6 Jehovah's Witness 48 25
## 7 Jewish 16 15
## 8 Mainline Protestant 29 20
## 9 Mormon 27 20
## 10 Muslim 34 17
## 11 Orthodox Christian 18 17
## 12 Unaffiliated 33 20
## X.50.000....99.000 X.100.000.or.more Sample.Size
## 1 32 13 233
## 2 26 19 6137
## 3 28 14 7462
## 4 34 36 172
## 5 17 8 1704
## 6 22 4 208
## 7 24 44 708
## 8 28 23 5208
## 9 33 20 594
## 10 29 20 205
## 11 36 29 155
## 12 26 21 6790
# Keep every column except Sample.Size; only the income percentages are
# reshaped and plotted below.
df_Income_Religion_1b <- df_Income_Religion_1[
  ,
  colnames(df_Income_Religion_1) != "Sample.Size"
]
df_Income_Religion_1b
## Religious.tradition Less.than..30.000 X.30.000....49.999
## 1 Buddhist 36 18
## 2 Catholic 36 19
## 3 Evangelical Protestant 35 22
## 4 Hindu 17 13
## 5 Historically Black Protestant 53 22
## 6 Jehovah's Witness 48 25
## 7 Jewish 16 15
## 8 Mainline Protestant 29 20
## 9 Mormon 27 20
## 10 Muslim 34 17
## 11 Orthodox Christian 18 17
## 12 Unaffiliated 33 20
## X.50.000....99.000 X.100.000.or.more
## 1 32 13
## 2 26 19
## 3 28 14
## 4 34 36
## 5 17 8
## 6 22 4
## 7 24 44
## 8 28 23
## 9 33 20
## 10 29 20
## 11 36 29
## 12 26 21
# Replace the machine-generated income column names with short labels.
# Any column whose name is not listed (e.g. Religious.tradition) is left as is.
names(df_Income_Religion_1b) <- vapply(
  names(df_Income_Religion_1b),
  function(col_name) {
    switch(
      col_name,
      "Less.than..30.000" = "Lessthan30",
      "X.30.000....49.999" = "In30to49",
      "X.50.000....99.000" = "In50to99",
      "X.100.000.or.more" = "In100plus",
      col_name
    )
  },
  character(1),
  USE.NAMES = FALSE
)
df_Income_Religion_1b
## Religious.tradition Lessthan30 In30to49 In50to99 In100plus
## 1 Buddhist 36 18 32 13
## 2 Catholic 36 19 26 19
## 3 Evangelical Protestant 35 22 28 14
## 4 Hindu 17 13 34 36
## 5 Historically Black Protestant 53 22 17 8
## 6 Jehovah's Witness 48 25 22 4
## 7 Jewish 16 15 24 44
## 8 Mainline Protestant 29 20 28 23
## 9 Mormon 27 20 33 20
## 10 Muslim 34 17 29 20
## 11 Orthodox Christian 18 17 36 29
## 12 Unaffiliated 33 20 26 21
# Show the column names after renaming
colnames(df_Income_Religion_1b)
## [1] "Religious.tradition" "Lessthan30" "In30to49"
## [4] "In50to99" "In100plus"
# Reshape from wide to long: the four income columns become key/value pairs
# (IncomeRange = former column name, Percent_Income = its value), then the
# rows are ordered by religious tradition.
df_Income_Religion_1_long <- df_Income_Religion_1b %>%
  gather(
    key = IncomeRange,
    value = Percent_Income,
    Lessthan30, In30to49, In50to99, In100plus
  ) %>%
  arrange(Religious.tradition)
df_Income_Religion_1_long
## Religious.tradition IncomeRange Percent_Income
## 1 Buddhist Lessthan30 36
## 2 Buddhist In30to49 18
## 3 Buddhist In50to99 32
## 4 Buddhist In100plus 13
## 5 Catholic Lessthan30 36
## 6 Catholic In30to49 19
## 7 Catholic In50to99 26
## 8 Catholic In100plus 19
## 9 Evangelical Protestant Lessthan30 35
## 10 Evangelical Protestant In30to49 22
## 11 Evangelical Protestant In50to99 28
## 12 Evangelical Protestant In100plus 14
## 13 Hindu Lessthan30 17
## 14 Hindu In30to49 13
## 15 Hindu In50to99 34
## 16 Hindu In100plus 36
## 17 Historically Black Protestant Lessthan30 53
## 18 Historically Black Protestant In30to49 22
## 19 Historically Black Protestant In50to99 17
## 20 Historically Black Protestant In100plus 8
## 21 Jehovah's Witness Lessthan30 48
## 22 Jehovah's Witness In30to49 25
## 23 Jehovah's Witness In50to99 22
## 24 Jehovah's Witness In100plus 4
## 25 Jewish Lessthan30 16
## 26 Jewish In30to49 15
## 27 Jewish In50to99 24
## 28 Jewish In100plus 44
## 29 Mainline Protestant Lessthan30 29
## 30 Mainline Protestant In30to49 20
## 31 Mainline Protestant In50to99 28
## 32 Mainline Protestant In100plus 23
## 33 Mormon Lessthan30 27
## 34 Mormon In30to49 20
## 35 Mormon In50to99 33
## 36 Mormon In100plus 20
## 37 Muslim Lessthan30 34
## 38 Muslim In30to49 17
## 39 Muslim In50to99 29
## 40 Muslim In100plus 20
## 41 Orthodox Christian Lessthan30 18
## 42 Orthodox Christian In30to49 17
## 43 Orthodox Christian In50to99 36
## 44 Orthodox Christian In100plus 29
## 45 Unaffiliated Lessthan30 33
## 46 Unaffiliated In30to49 20
## 47 Unaffiliated In50to99 26
## 48 Unaffiliated In100plus 21
# Plots 1 and 2 use ggplot() + geom_line() instead of qplot(), which is
# deprecated as of ggplot2 3.4.0; the aesthetics and geom are unchanged.

# Plot-1: range of Percent_Income values per IncomeRange.
# With a discrete x axis, geom_line() draws one vertical segment per category
# connecting that category's observations.
ggplot(df_Income_Religion_1_long, aes(x = IncomeRange, y = Percent_Income)) +
  geom_line()

# Plot-2: range of Percent_Income values per Religious.tradition
# (x labels rotated 90 degrees so the long tradition names stay readable).
ggplot(
  df_Income_Religion_1_long,
  aes(x = Religious.tradition, y = Percent_Income)
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Plot-3: Percent_Income per Religious.tradition, one coloured path per
# IncomeRange.
ggplot(
  df_Income_Religion_1_long,
  aes(
    Religious.tradition,
    Percent_Income,
    group = IncomeRange,
    colour = IncomeRange
  )
) +
  geom_path(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Group the long data by religious tradition and income range
By_Income_Range_1 <- df_Income_Religion_1_long %>%
  group_by(Religious.tradition, IncomeRange)
By_Income_Range_1
## Source: local data frame [48 x 3]
## Groups: Religious.tradition, IncomeRange [48]
##
## Religious.tradition IncomeRange Percent_Income
## <chr> <chr> <int>
## 1 Buddhist Lessthan30 36
## 2 Buddhist In30to49 18
## 3 Buddhist In50to99 32
## 4 Buddhist In100plus 13
## 5 Catholic Lessthan30 36
## 6 Catholic In30to49 19
## 7 Catholic In50to99 26
## 8 Catholic In100plus 19
## 9 Evangelical Protestant Lessthan30 35
## 10 Evangelical Protestant In30to49 22
## # ... with 38 more rows
# Percent_Income per IncomeRange, drawn from the grouped data frame
ggplot(By_Income_Range_1, aes(x = IncomeRange, y = Percent_Income)) +
  geom_line()
# Pick the columns used in the analysis (grouping is carried over)
By_Income_Range_2 <- By_Income_Range_1 %>%
  select(Religious.tradition, IncomeRange, Percent_Income)
By_Income_Range_2
## Source: local data frame [48 x 3]
## Groups: Religious.tradition, IncomeRange [48]
##
## Religious.tradition IncomeRange Percent_Income
## <chr> <chr> <int>
## 1 Buddhist Lessthan30 36
## 2 Buddhist In30to49 18
## 3 Buddhist In50to99 32
## 4 Buddhist In100plus 13
## 5 Catholic Lessthan30 36
## 6 Catholic In30to49 19
## 7 Catholic In50to99 26
## 8 Catholic In100plus 19
## 9 Evangelical Protestant Lessthan30 35
## 10 Evangelical Protestant In30to49 22
## # ... with 38 more rows
# Summarise Percent_Income per (Religious.tradition, IncomeRange) group.
# Each of the 48 groups holds a single row, so Mean_Percent_Income equals the
# original Percent_Income value for that tradition and income range.
By_Income_Range_3 <- By_Income_Range_2 %>%
  summarise(Mean_Percent_Income = mean(Percent_Income))
By_Income_Range_3
## Source: local data frame [48 x 3]
## Groups: Religious.tradition [?]
##
## Religious.tradition IncomeRange Mean_Percent_Income
## <chr> <chr> <dbl>
## 1 Buddhist In100plus 13
## 2 Buddhist In30to49 18
## 3 Buddhist In50to99 32
## 4 Buddhist Lessthan30 36
## 5 Catholic In100plus 19
## 6 Catholic In30to49 19
## 7 Catholic In50to99 26
## 8 Catholic Lessthan30 36
## 9 Evangelical Protestant In100plus 14
## 10 Evangelical Protestant In30to49 22
## # ... with 38 more rows
# Plot-4: Mean_Percent_Income per Religious.tradition, one coloured path per
# IncomeRange.
ggplot(
  By_Income_Range_3,
  aes(
    Religious.tradition,
    Mean_Percent_Income,
    group = IncomeRange,
    colour = IncomeRange
  )
) +
  geom_path(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Plot-5: Mean_Percent_Income per IncomeRange, one coloured path per
# Religious.tradition.
ggplot(
  By_Income_Range_3,
  aes(
    IncomeRange,
    y = Mean_Percent_Income,
    group = Religious.tradition,
    colour = Religious.tradition
  )
) +
  geom_path(alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Conclusion: The range of Percent_Income values is widest in the Lessthan30 and In100plus income ranges. As shown in Plot-2, the range of Percent_Income is widest among Historically Black Protestant and Jehovah's Witness and narrowest among Mainline Protestant and Mormon. Plot-4 shows that the Mean_Percent_Income is lowest among Historically Black Protestant and highest among Jewish. Plot-5 shows that the widest disparities between religions are in the In100plus range and in the Lessthan30 range.