We make an attempt to acquire different datasets, process them into R readable format of data frame and perform data transformation in long /wide formats using tidyr / dplyr / reshape packages.
Loading Packages Used
knitr::opts_chunk$set(message = FALSE, echo = TRUE)
# Library to read data file
library(RCurl)
library(stringr)
library(knitr)
library(ggplot2)
# Library for data display in tabular format
library(DT)
library(tidyr)
library(dplyr)
library(reshape)Data available in HTML, JSON and XML formats are loaded.
The data files considered are in following format. Below are the github links to the data files.
birthdata.giturl <- "https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week6P2/birthdata.csv"
birthdata.gitdata <- getURL(birthdata.giturl)
birthdata <- read.csv2(text = birthdata.gitdata, header = T, sep = ",", dec = ".",
stringsAsFactors = FALSE)
# Verifying records and variables
nrow(birthdata)## [1] 7
ncol(birthdata)## [1] 8
head(birthdata)## caseid v012 b2_01 b2_02 b2_03 b4_01 b4_02 b4_03
## 1 1 30 2000 2005 NA 1 1 NA
## 2 2 29 2001 2010 NA 1 2 NA
## 3 3 32 1999 2002 2006 1 1 1
## 4 4 35 1999 2009 NA 2 1 NA
## 5 5 34 1998 NA NA 2 NA NA
## 6 6 23 NA NA NA NA NA NA
# Renaming Columns
colnames(birthdata) <- c("Case", "MotherAge", "BabyYear_1", "BabyYear_2", "BabyYear_3",
"BabyWt_1", "BabyWt_2", "BabyWt_3")
# Display data frame content
datatable(birthdata, rownames = FALSE)Using reshape package
The birth data is available on wide format. Converting to long with relevant columns , Using reshape() to convert to long format
Reshape() provides the best way to solve this kind of dataset. The column names need to follow a proper format to convert to long.
birthdatalong <- reshape(birthdata, varying = c("BabyYear_1", "BabyYear_2", "BabyYear_3",
"BabyWt_1", "BabyWt_2", "BabyWt_3"), direction = "long", idvar = "Case", sep = "_")
datatable(birthdatalong, rownames = FALSE)# Average baby Weight ForEeach Mother
birthdata_meanbabywt <- birthdatalong %>% select(Case, BabyWt) %>% group_by(Case) %>%
dplyr::summarise(AvgBabyWt = mean(BabyWt, na.rm = TRUE))## Warning in combine_vars(vars, ind_list): '.Random.seed' is not an integer
## vector but of type 'NULL', so ignored
birthdata_meanbabywt## # A tibble: 7 × 2
## Case AvgBabyWt
## <int> <dbl>
## 1 1 1.0
## 2 2 1.5
## 3 3 1.0
## 4 4 1.5
## 5 5 2.0
## 6 6 NaN
## 7 7 1.0
# Average Mother Age For Each BabyBirth
birthdata_meanbabywtage <- birthdatalong %>% select(time, MotherAge) %>% group_by(time) %>%
dplyr::summarise(AvgMotherAgeFirstBaby = mean(birthdatalong$MotherAge, na.rm = TRUE))
library(scales)## Warning: package 'scales' was built under R version 3.3.2
ggplot(birthdatalong, aes(y = birthdatalong$BabyWt, x = birthdatalong$MotherAge)) +
geom_point(aes(color = factor(Case), group = Case)) + scale_color_discrete(name = "Case") +
labs(title = "Mother Age Vs Baby Weight ", x = "Mother Age", y = "Baby Weight")## Warning: Removed 10 rows containing missing values (geom_point).
Since there are many NA values , and also the point may be over lapping in the plot.
Courtesy : Dataset from Sharon Morris
bike.giturl <- "https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week6P2/201606-citibike-tripdata.csv"
bike.gitdata <- getURL(bike.giturl)
bikedata <- read.csv2(text = bike.gitdata, header = T, sep = ",", dec = ".", stringsAsFactors = FALSE)
# Verifying records and variables
nrow(bikedata)## [1] 65499
ncol(bikedata)## [1] 15
datatable(head(bikedata))############################################################################### Lets select only the columns that we need to focus on for gender based trip duration between stations
head(bikedata)## tripduration starttime stoptime start.station.id
## 1 1470 6/1/2016 00:00:18 6/1/2016 00:24:48 380
## 2 229 6/1/2016 00:00:20 6/1/2016 00:04:09 3092
## 3 344 6/1/2016 00:00:21 6/1/2016 00:06:06 449
## 4 1120 6/1/2016 00:00:28 6/1/2016 00:19:09 522
## 5 229 6/1/2016 00:00:53 6/1/2016 00:04:42 335
## 6 946 6/1/2016 00:01:01 6/1/2016 00:16:48 503
## start.station.name start.station.latitude start.station.longitude
## 1 W 4 St & 7 Ave S 40.73401 -74.00294
## 2 Berry St & N 8 St 40.71901 -73.95853
## 3 W 52 St & 9 Ave 40.76462 -73.98789
## 4 E 51 St & Lexington Ave 40.75715 -73.97208
## 5 Washington Pl & Broadway 40.72904 -73.99405
## 6 E 20 St & Park Ave 40.73827 -73.98752
## end.station.id end.station.name end.station.latitude
## 1 3236 W 42 St & Dyer Ave 40.75898
## 2 3103 N 11 St & Wythe Ave 40.72153
## 3 469 Broadway & W 53 St 40.76344
## 4 401 Allen St & Rivington St 40.72020
## 5 285 Broadway & E 14 St 40.73455
## 6 495 W 47 St & 10 Ave 40.76270
## end.station.longitude bikeid usertype birth.year gender
## 1 -73.99380 19859 Subscriber 1972 1
## 2 -73.95782 16233 Subscriber 1967 1
## 3 -73.98268 22397 Subscriber 1989 1
## 4 -73.98998 16231 Subscriber 1991 1
## 5 -73.99074 15400 Subscriber 1989 1
## 6 -73.99301 25193 Subscriber 1974 1
colnames(bikedata) <- c("tripduration", "starttime", "stoptime", "startid", "startname",
"startlat", "startlong", "stopid", "stopname", "stoplat", "stoplong", "bikeid",
"usertype", "birthyear", "gender")
bikedata <- bikedata %>% dplyr::select(tripduration, starttime, stoptime, startid,
stopid, usertype, gender)
############################################################################### # Since the data is too large we filter the same. Choosing trip duration(since it
# is in seconds) for greater than 5 hours
bikedatasub_18k <- bikedata %>% dplyr::filter(tripduration >= 18000)
# Adding Column of tripduration in hours
bikedatasub_triphr <- bikedatasub_18k %>% mutate(round(triphours = tripduration/3600))
# caclulate mean time taken between Stations :Lexington Ave And Pearl St from
# base bike data
bikedatasub2 <- bikedata %>% filter(startid == "120" & stopid == "82") %>% mutate(tripname = "LexPearl") %>%
summarize(Meantime = mean(tripduration, na.rm = TRUE))
bikedatasub2## Meantime
## 1 37175
# Finding total number of different trips in bikedata set for above 5 hours and
# the mean time for each trip and arrange descending mean trip duration
bikedatasub_tripsnmean <- bikedatasub_18k %>% group_by(startid, stopid) %>% summarize(total = n(),
tripmeantime = mean(tripduration, na.rm = TRUE)) %>% arrange(desc(tripmeantime))
datatable(bikedatasub_tripsnmean)# grouping by usertype
bikedatasub3 <- bikedatasub_18k %>% group_by(usertype) %>% dplyr::summarise(avgdurn = mean(bikedatasub_18k$tripduration,
totaltrip = n()))
############################################################################### (Courtesy : Dataset from Marco Sequeira)
relinc.giturl <- "https://raw.githubusercontent.com/DataDriven-MSDA/DATA607/master/Week6P2/religionincome.csv"
relinc.gitdata <- getURL(relinc.giturl)
relincdata <- read.csv2(text = relinc.gitdata, header = T, sep = ",", dec = ".",
stringsAsFactors = FALSE)
# Verifying records and variables
nrow(relincdata)## [1] 12
ncol(relincdata)## [1] 6
head(relincdata)## Religious.tradition Less.than..30.000 X.30.000..49.999
## 1 Buddhist 36% 18%
## 2 Catholic 36% 19%
## 3 Evangelical Protestant 35% 22%
## 4 Hindu 17% 13%
## 5 Historically Black Protestant 53% 22%
## 6 Jehovah's Witness 48% 25%
## X.50.000..99.999 X.100.000.or.more Sample.Size
## 1 32% 13% 233
## 2 26% 19% 6,137
## 3 28% 14% 7,462
## 4 34% 36% 172
## 5 17% 8% 1,704
## 6 22% 4% 208
############################################################################### # Renaming Columns
colnames(relincdata) <- c("Tradition", "ForLessThan30", "From30To49", "From50To99",
"MoreThan100", "Sample")
# Removing % sign from the Columns
relincdata$ForLessThan30 <- as.numeric(str_extract(relincdata$ForLessThan30, "[[:digit:]]{2,}"))
relincdata$From30To49 <- as.numeric(str_extract(relincdata$From30To49, "[[:digit:]]{2,}"))
relincdata$From50To99 <- as.numeric(str_extract(relincdata$From50To99, "[[:digit:]]{2,}"))
relincdata$MoreThan100 <- as.numeric(str_extract(relincdata$MoreThan100, "[[:digit:]]{2,}"))
# Removing , from Sample
relincdata$Sample <- as.numeric(gsub(",", "", relincdata$Sample))
# Display data frame content
datatable(relincdata, rownames = FALSE)############################################################################### Using tidyr package
Using gather() to convert to long format
tidyrelincdata <- relincdata %>% gather("Category", "Pct", 2:5, na.rm = FALSE)
tidyrelincdata$PctVal = round(((as.numeric(tidyrelincdata$Pct)/100) * as.numeric(tidyrelincdata$Sample)),
2)
datatable(tidyrelincdata)G <- ggplot(tidyrelincdata, aes(x = tidyrelincdata$Category, y = tidyrelincdata$PctVal)) +
geom_point(aes(color = Tradition)) + ggtitle("Religion And Income Categoris") +
xlab("Category") + ylab("People In Category")
G## Warning: Removed 2 rows containing missing values (geom_point).
From the plot we notice that Evangelist Protestant followers are the maximum working in less than 30K category as well as in the 50k - 99k category
While themore than 100 K category is led by Unaffiliated followers.
The least in each category are Orthodox Christian followers.