The dataset is from Emilie Bolduc's post. The untidy data is tidied to answer the following questions:
1) Where in the US has there been the most interest in yoga?
2) When were people most interested in yoga over the last 10 years?
3) An interactive map showing the popularity of yoga by state.
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
## Google Maps API Terms of Service: http://developers.google.com/maps/terms.
## Please cite ggmap if you use it: see citation("ggmap") for details.
library(leaflet)
data <- read.csv("yoga.csv")
data <- data[-1,]
head(data)
## X Alabama..us.al. Alaska..us.ak. Arizona..us.az. Arkansas..us.ar.
## 2 2004-01 20 23 21 24
## 3 2004-02 8 26 25 16
## 4 2004-03 10 26 22 26
## 5 2004-04 15 34 24 18
## 6 2004-05 15 14 24 11
## 7 2004-06 12 21 23 14
## California..us.ca. Colorado..us.co. Connecticut..us.ct. Delaware..us.de.
## 2 32 33 27 47
## 3 27 30 26 28
## 4 28 29 30 51
## 5 25 27 22 27
## 6 25 24 25 20
## 7 27 27 26 22
## District.of.Columbia..us.dc. Florida..us.fl. Georgia..us.ga.
## 2 32 21 21
## 3 36 17 20
## 4 29 17 20
## 5 29 19 15
## 6 24 20 16
## 7 26 18 18
## Hawaii..us.hi. Idaho..us.id. Illinois..us.il. Indiana..us.in.
## 2 36 21 25 24
## 3 24 22 23 14
## 4 36 21 25 17
## 5 30 18 17 14
## 6 39 14 17 9
## 7 44 17 20 18
## Iowa..us.ia. Kansas..us.ks. Kentucky..us.ky. Louisiana..us.la.
## 2 14 20 17 20
## 3 16 12 19 15
## 4 18 13 18 17
## 5 19 19 12 20
## 6 14 15 15 14
## 7 17 18 17 16
## Maine..us.me. Maryland..us.md. Massachusetts..us.ma. Michigan..us.mi.
## 2 29 26 41 19
## 3 29 23 33 18
## 4 26 22 32 17
## 5 21 17 31 15
## 6 25 21 27 15
## 7 22 23 29 17
## Minnesota..us.mn. Mississippi..us.ms. Missouri..us.mo. Montana..us.mt.
## 2 26 16 19 44
## 3 22 20 18 26
## 4 21 18 15 41
## 5 17 18 12 25
## 6 22 13 15 24
## 7 18 18 20 21
## Nebraska..us.ne. Nevada..us.nv. New.Hampshire..us.nh. New.Jersey..us.nj.
## 2 15 20 45 27
## 3 21 25 20 22
## 4 16 24 22 23
## 5 16 17 26 20
## 6 14 25 23 22
## 7 15 23 27 21
## New.Mexico..us.nm. New.York..us.ny. North.Carolina..us.nc.
## 2 33 35 23
## 3 25 28 22
## 4 26 29 20
## 5 18 26 18
## 6 21 28 16
## 7 32 28 22
## North.Dakota..us.nd. Ohio..us.oh. Oklahoma..us.ok. Oregon..us.or.
## 2 52 19 22 34
## 3 45 16 19 30
## 4 45 15 12 28
## 5 45 13 11 22
## 6 45 16 17 25
## 7 45 16 17 27
## Pennsylvania..us.pa. Rhode.Island..us.ri. South.Carolina..us.sc.
## 2 19 44 24
## 3 18 26 19
## 4 20 27 18
## 5 19 31 14
## 6 16 26 11
## 7 20 33 19
## South.Dakota..us.sd. Tennessee..us.tn. Texas..us.tx. Utah..us.ut.
## 2 25 21 24 26
## 3 22 18 16 20
## 4 21 16 17 10
## 5 21 21 16 20
## 6 28 15 17 19
## 7 28 16 17 16
## Vermont..us.vt. Virginia..us.va. Washington..us.wa.
## 2 42 22 30
## 3 39 16 29
## 4 41 19 27
## 5 37 17 25
## 6 31 18 25
## 7 34 22 27
## West.Virginia..us.wv. Wisconsin..us.wi. Wyoming..us.wy.
## 2 23 18 0
## 3 17 17 37
## 4 27 20 35
## 5 26 17 37
## 6 21 18 35
## 7 28 19 35
#str(data)
colnames(data) <- c("Year","Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","DC","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada","New_Hampshire","New_Jersey","New_Mexico","New_York","North_Carolina","North_Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode_Island","South_Carolina","South_Dakota","Tennessee","Texas","Utah","Vermont","Virginia","Washington","West_Virginia","Wisconsin","Wyoming")
data1 <- data %>%
  separate(Year, into = c("Year", "month"), sep = "-")
data1 <- data1[,-2]   # drop the month column
data2 <- data1 %>%
  group_by(Year) %>%
  summarise_all(funs(sum))
data2 <- data.frame(data2)
data3 <- setNames(data.frame(t(data2[,-1])),data2[,1])
for(i in 1:51) {
  data3$total[i] <- rowSums(data3[i, 1:13], na.rm = TRUE)   # 13 year columns, 2004-2016
}
head(data3)
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
## Alabama 151 153 166 136 127 134 162 144 163 181 211 236
## Alaska 267 276 252 280 273 306 427 416 433 481 512 582
## Arizona 271 270 255 245 263 271 300 264 283 288 307 323
## Arkansas 204 185 176 163 139 159 209 168 189 240 259 256
## California 321 309 301 287 266 257 234 291 301 320 329 329
## Colorado 337 353 369 347 332 328 367 360 374 410 409 437
## 2016 total
## Alabama 78 2042
## Alaska 221 4726
## Arizona 113 3453
## Arkansas 77 2424
## California 114 3659
## Colorado 155 4578
max(data3$total)
## [1] 7529
Vermont, with a total of 7529, showed the most search interest in yoga over the period.
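This can be checked programmatically; a quick sketch using the row names that carry the state labels:
# Which state attains the maximum total? Expected to return "Vermont".
rownames(data3)[which.max(data3$total)]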
data3[52,] = c(colSums(data3[,1:14]))
max(data3[52,-14])
## [1] 18475
The year 2015 recorded the highest total search interest in yoga.
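Again, a quick check (a sketch against the totals row just added as row 52):
# Which year column has the largest total? Expected to return "2015".
names(which.max(unlist(data3[52, 1:13])))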
data3$Regions <- c("Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","DC","Florida","Georgia","Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada","New_Hampshire","New_Jersey","New_Mexico","New_York","North_Carolina","North_Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode_Island","South_Carolina","South_Dakota","Tennessee","Texas","Utah","Vermont","Virginia","Washington","West_Virginia","Wisconsin","Wyoming","Total1")
data3$total <- as.numeric(data3$total)
data3$Regions <- as.character(data3$Regions)
Getting the latitude and longitude of each state into the data frame.
for (i in 1:nrow(data3)) {
  latlon = geocode(data3[i, 15])        # column 15 is Regions
  data3$lon[i] = as.numeric(latlon[1])
  data3$lat[i] = as.numeric(latlon[2])
}
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Alabama
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Alaska
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Arizona
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Arkansas
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=California
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Colorado
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Connecticut
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Delaware
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=DC
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Florida
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Georgia
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Hawaii
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Idaho
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Illinois
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Indiana
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Iowa
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Kansas
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Kentucky
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Louisiana
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Maine
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Maryland
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Massachusetts
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Michigan
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Minnesota
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Mississippi
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Missouri
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Montana
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Nebraska
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Nevada
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New_Hampshire
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New_Jersey
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New_Mexico
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New_York
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=North_Carolina
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=North_Dakota
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Ohio
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Oklahoma
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Oregon
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Pennsylvania
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Rhode_Island
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=South_Carolina
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=South_Dakota
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Tennessee
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Texas
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Utah
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Vermont
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Virginia
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Washington
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=West_Virginia
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Wisconsin
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Wyoming
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Total1
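If the Google geocoding API is unavailable or rate-limited, base R's state.center dataset offers an offline alternative with approximate centroids for the 50 states; a minimal sketch (DC and the Total1 row find no match and stay NA):
# Offline alternative: approximate state centroids from base R's datasets.
idx <- match(gsub("_", " ", data3$Regions), state.name)
data3$lon <- state.center$x[idx]
data3$lat <- state.center$y[idx]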
Popularity of yoga by state
data4 <- data.frame(data3$total, data3$lon, data3$lat, data3$Regions)
colnames(data4) = c('total', 'lon', 'lat', 'Region')
m <- leaflet() %>%
  addTiles() %>%
  addMarkers(lng = data4$lon, lat = data4$lat,
             popup = paste("Region", data4$Region, "- Total", data4$total))
m
#### Clicking on a pin shows the total search interest in yoga for that state.
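A variant worth sketching: circle markers scaled by the total make relative popularity visible without clicking (the radius divisor here is an arbitrary choice):
# Sketch: circle size encodes total search interest; NA rows (Total1) dropped.
leaflet(na.omit(data4)) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~lon, lat = ~lat, radius = ~total / 1000,
                   popup = ~paste("Region", Region, "- Total", total))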
II) My second dataset is from Chad Smith's post, based on a catfish farming survey. I calculated the numbers by year and by size category, starting with crunching and tidying the data.
Link to dataset: CatfishFarm.xlsx
CFData <- read.csv("CatfishFarm.csv")
colnames(CFData) = as.character(unlist(CFData[1,]))  # first row holds the real headers
CFData = CFData[-1,]      # drop the header row now that the names are set
CFData = CFData[-7:-8,]   # drop trailing non-data rows
CFData
## Size category 1992 1993 1994 1995 1996 1997
## 2 Broodfish 1/ 1,491 1,169 1,183 1,301 1,171 1,163
## 3 Fingerling/fry 2/ 849,412 669,491 648,628 724,693 823,397 873,457
## 4 Stockers 3/ 634,353 571,254 548,207 554,342 627,834 754,816
## 5 Small foodsize 4/ 166,731 153,600 134,314 138,160 156,297 178,448
## 6 Medium foodsize 5/ 70,495 61,894 48,851 59,159 64,858 84,725
## 7 Large foodsize 6/ 6,769 6,698 5,196 4,536 6,644 7,810
## 1998 1999 2000 2001 2002 2003 2004 2005
## 2 1,187 1,155 1,377 1,327 1,171 1,303 1,113 1,053
## 3 975,542 986,368 1,053,300 1,023,533 1,066,400 990,163 745,849 712,144
## 4 607,878 678,682 790,683 845,287 676,378 775,226 890,275 660,000
## 5 178,511 182,251 200,032 239,655 287,591 254,920 261,323 243,090
## 6 62,140 63,049 77,149 87,926 106,117 127,908 109,120 95,240
## 7 7,295 9,266 5,812 6,872 10,746 11,195 10,947 10,642
## 2006 2007 2008 2009 2010 2011 2012 2013
## 2 1,091 886 801 704 536 495 562 540
## 3 1,045,266 985,620 951,910 728,340 429,590 568,990 451,100 398,510
## 4 781,958 586,320 688,844 586,069 366,090 380,660 463,485 339,260
## 5 214,848 210,340 204,750 193,870 169,030 115,560 112,970 103,520
## 6 103,591 104,080 107,800 105,610 91,790 54,130 64,740 58,015
## 7 10,823 8,986 9,290 9,316 8,570 6,212 3,595 5,155
## 2014 2015 2016
## 2 650 577 520
## 3 420,060 449,510 328,570
## 4 289,080 248,790 204,800
## 5 102,190 96,810 100,850
## 6 50,600 48,220 45,775
## 7 4,500 5,090 3,520
The data shows considerable variation in the "Fingerling/fry" size category over the years, while the numbers for Broodfish and Large foodsize fish are relatively stable.
To analyze the data by year, we use tidyr's spread() function.
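The long-form frame TCFData referenced below does not appear above; a minimal reconstruction, assuming a gather() step that stacks the year columns into a Budget value column (both names are inferred from the spread() call):
# Assumed reconstruction of the missing tidying step: one row per
# size category / year pair, with the count in `Budget`.
TCFData <- gather(CFData, key = "Year", value = "Budget", -`Size category`)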
TSCFdata <- spread(TCFData,`Size category`,`Budget`)
TSCFdata <- data.frame(TSCFdata)
head(TSCFdata)
## Year Broodfish.1. Fingerling.fry.2. Large.foodsize.6. Medium.foodsize.5.
## 1 1992 1491 849412 6769 70495
## 2 1993 1169 669491 6698 61894
## 3 1994 1183 648628 5196 48851
## 4 1995 1301 724693 4536 59159
## 5 1996 1171 823397 6644 64858
## 6 1997 1163 873457 7810 84725
## Small.foodsize.4. Stockers.3.
## 1 166731 634353
## 2 153600 571254
## 3 134314 548207
## 4 138160 554342
## 5 156297 627834
## 6 178448 754816
TSCFdata <- na.omit(TSCFdata)
TSCFdata$Broodfish.1. <- as.numeric(gsub(",","",TSCFdata$Broodfish.1.))
TSCFdata$Fingerling.fry.2. <- as.numeric(gsub(",","",TSCFdata$Fingerling.fry.2.))
TSCFdata$Large.foodsize.6. <- as.numeric(gsub(",","",TSCFdata$Large.foodsize.6.))
TSCFdata$Medium.foodsize.5. <- as.numeric(gsub(",","",TSCFdata$Medium.foodsize.5.))
TSCFdata$Small.foodsize.4. <- as.numeric(gsub(",","",TSCFdata$Small.foodsize.4.))
TSCFdata$Stockers.3. <- as.numeric(gsub(",","",TSCFdata$Stockers.3.))
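As an aside, the six conversions above can be written more compactly; a sketch that applies the same comma-stripping to every column except Year:
# Equivalent compact form: strip commas and coerce all non-Year columns.
TSCFdata[-1] <- lapply(TSCFdata[-1], function(x) as.numeric(gsub(",", "", x)))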
TSCFData1 <- mutate(TSCFdata, Total = rowSums(TSCFdata[,2:7]))
ggplot(TSCFData1, aes(x = reorder(Year, Total), y = Total)) +
  geom_line(aes(group = 1), linetype = "dotted") +   # group = 1 avoids the one-observation-per-group warning
  geom_point(shape = 20, size = 4) +
  ggtitle("Year wise comparison") +
  labs(x = "Year", y = "Total Inventory Amount") +
  coord_flip()

Observation shows that 2016 resulted in a much smaller total inventory of catfish than earlier years.
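The claim can be confirmed programmatically; a quick sketch using the Total column built above:
# Year with the smallest total inventory; expected to return "2016".
TSCFData1$Year[which.min(TSCFData1$Total)]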
III) My third dataset is from Albina Gallyavova's post on Fed commercial paper (CP) rates.
The attached file has Federal Reserve Board commercial paper rates with essentially several header rows. The columns should be split and organized by type (financial/non-financial/asset-backed, etc.) and tenor (overnight/7-day/15-day, etc.), depending on the intention of the analysis. It could be useful to look at the patterns in the different buckets, perform basic statistical analysis, and estimate future rates.
Loading the data and getting it in order.
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
dataset <- read.csv("FRB_CP.csv", stringsAsFactors = FALSE)
dataset[dataset == 'ND'] <- 0    # 'ND' marks days with no data
dataset <- na.omit(dataset)
dataset <- dataset[-1:-4,]       # drop the extra header rows
Renaming data columns.
names(dataset) <- c("Date","0D-AA-NFC","7D-AA-NFC","15D-AA-NFC","30D-AA-NFC","60D-AA-NFC","90D-AA-NFC","0D-A2-P2-NFC","7D-A2-P2-NFC","15D-A2-P2-NFC","30D-A2-P2-NFC","60D-A2-P2-NFC","90D-A2-P2-NFC","0D-AA-FC","7D-AA-FC","15D-AA-FC","30D-AA-FC","60D-AA-FC","90D-AA-FC","0D-AA-ABC","7D-AA-ABC","15D-AA-ABC","30D-AA-ABC","60D-AA-ABC","90D-AA-ABC")
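As an aside, because each new name encodes both tenor and bucket, the wide table can also be reshaped and the names split into key columns for bucket-level analysis; a minimal sketch, assuming the names just assigned:
# Sketch: long form with tenor and bucket parsed out of the series name,
# e.g. "30D-A2-P2-NFC" -> tenor "30D", bucket "A2-P2-NFC".
long_rates <- dataset %>%
  gather(key = "series", value = "rate", -Date) %>%
  separate(series, into = c("tenor", "bucket"), sep = "-", extra = "merge")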
Separating the date column to analyze on the basis of year.
dataset1 <- dataset %>%
  separate(Date, into = c("month", "day", "Year"), sep = "/")
dataset1 <- dataset1[,-1:-2]   # keep only the Year
dataset1$Year <- as.numeric(dataset1$Year)
head(dataset1)
## Year 0D-AA-NFC 7D-AA-NFC 15D-AA-NFC 30D-AA-NFC 60D-AA-NFC 90D-AA-NFC
## 6 16 0.36 0.35 0.38 0.42 0.48 0.54
## 7 16 0.35 0.35 0.37 0.42 0.47 0.53
## 8 16 0.36 0.38 0.41 0.43 0.49 0.54
## 9 16 0.37 0.35 0.37 0.4 0.46 0.52
## 10 16 0.37 0.38 0.39 0.41 0.47 0.53
## 11 16 0.36 0.35 0.52 0.42 0.46 0.52
## 0D-A2-P2-NFC 7D-A2-P2-NFC 15D-A2-P2-NFC 30D-A2-P2-NFC 60D-A2-P2-NFC
## 6 0.58 0.69 0.66 0.71 0.86
## 7 0.59 0.66 0.67 0.85 0.83
## 8 0.59 0.65 0.64 0.8 0.85
## 9 0.59 0.64 0.68 0.76 0.8
## 10 0.59 0.64 0.69 0.77 0.92
## 11 0.58 0.63 0.71 0.81 0.94
## 90D-A2-P2-NFC 0D-AA-FC 7D-AA-FC 15D-AA-FC 30D-AA-FC 60D-AA-FC 90D-AA-FC
## 6 0 0.4 0 0 0 0.62 0.87
## 7 0 0.4 0.39 0.4 0.42 0.59 0.76
## 8 1 0.4 0.42 0.45 0.51 0.64 0.77
## 9 0 0.4 0.41 0.42 0.45 0.58 0.73
## 10 1.07 0.4 0.41 0.41 0.43 0.61 0.79
## 11 0 0.4 0.42 0 0.49 0.62 0.84
## 0D-AA-ABC 7D-AA-ABC 15D-AA-ABC 30D-AA-ABC 60D-AA-ABC 90D-AA-ABC
## 6 0.46 0.48 0.52 0.61 0.79 0.95
## 7 0.46 0.48 0.49 0.6 0.7 0.91
## 8 0.46 0.51 0.52 0.63 0.76 0.89
## 9 0.47 0.48 0.53 0.63 0.77 0.92
## 10 0.46 0.51 0.56 0.66 0.73 0.9
## 11 0.47 0.48 0.56 0.57 0.76 0.94
Grouping with respect to year
dataset1 <- data.frame(sapply(dataset1, function(x) as.numeric(as.character(x))))
dataset2 <- dataset1 %>%
  group_by(Year) %>%
  summarise_all(funs(sum))
head(dataset2)
## # A tibble: 2 x 25
## Year X0D.AA.NFC X7D.AA.NFC X15D.AA.NFC X30D.AA.NFC X60D.AA.NFC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 16 29.11 29.44 30.51 34.20 35.54
## 2 17 159.85 158.46 159.23 165.28 167.01
## # ... with 19 more variables: X90D.AA.NFC <dbl>, X0D.A2.P2.NFC <dbl>,
## # X7D.A2.P2.NFC <dbl>, X15D.A2.P2.NFC <dbl>, X30D.A2.P2.NFC <dbl>,
## # X60D.A2.P2.NFC <dbl>, X90D.A2.P2.NFC <dbl>, X0D.AA.FC <dbl>,
## # X7D.AA.FC <dbl>, X15D.AA.FC <dbl>, X30D.AA.FC <dbl>, X60D.AA.FC <dbl>,
## # X90D.AA.FC <dbl>, X0D.AA.ABC <dbl>, X7D.AA.ABC <dbl>,
## # X15D.AA.ABC <dbl>, X30D.AA.ABC <dbl>, X60D.AA.ABC <dbl>,
## # X90D.AA.ABC <dbl>
A graph depicting the year-wise change in CP rates.
dataset3 <- melt(dataset2, id.vars = "Year",
                 measure.vars = grep("^X", names(dataset2), value = TRUE))
ggplot(dataset3, aes(x = variable, y = value, color = as.factor(Year))) +
  geom_point() +
  geom_line(aes(group = Year)) +
  coord_flip(ylim = c(1, 100))   # a later coord_*() call replaces coord_cartesian(), so set the limits in coord_flip()

for(i in 1:2) {
  dataset2$total[i] <- rowSums(dataset2[i, 2:25], na.rm = TRUE)
}
## Warning: Unknown or uninitialised column: 'total'.
Percentage increase in the yearly total rates from 2016 to 2017, using the totals computed above:
percent_increase <- (671.13-208.02)/(208.02)
percent_increase*100
## [1] 222.6276
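The same figure can be computed from the data rather than hardcoded values; a sketch, assuming rows 1 and 2 of dataset2 hold the 2016 and 2017 totals:
# Percent change computed directly from the total column.
(dataset2$total[2] - dataset2$total[1]) / dataset2$total[1] * 100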
dataset2[3,] = c("Total", colSums(dataset2[,2:26]))
head(dataset2)
## # A tibble: 3 x 26
## Year X0D.AA.NFC X7D.AA.NFC X15D.AA.NFC X30D.AA.NFC X60D.AA.NFC
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 16 29.11 29.44 30.51 34.2 35.54
## 2 17 159.85 158.46 159.23 165.28 167.01
## 3 Total 188.96 187.9 189.74 199.48 202.55
## # ... with 20 more variables: X90D.AA.NFC <chr>, X0D.A2.P2.NFC <chr>,
## # X7D.A2.P2.NFC <chr>, X15D.A2.P2.NFC <chr>, X30D.A2.P2.NFC <chr>,
## # X60D.A2.P2.NFC <chr>, X90D.A2.P2.NFC <chr>, X0D.AA.FC <chr>,
## # X7D.AA.FC <chr>, X15D.AA.FC <chr>, X30D.AA.FC <chr>, X60D.AA.FC <chr>,
## # X90D.AA.FC <chr>, X0D.AA.ABC <chr>, X7D.AA.ABC <chr>,
## # X15D.AA.ABC <chr>, X30D.AA.ABC <chr>, X60D.AA.ABC <chr>,
## # X90D.AA.ABC <chr>, total <chr>
I added all the row and column totals so that the financial, non-financial, and asset-backed rates can be calculated separately; depending on which rates are of interest, one can simply add up the corresponding columns and take the sum. I primarily focused on how the rates changed across the two years, which is represented in the graph above.