# Load the state-by-month yoga search table from the course repository.
yoga_url <- "https://raw.githubusercontent.com/emrahakin1985/DATA607/master/datasets/yoga.csv"
yoga <- read.csv(yoga_url, stringsAsFactors = FALSE)

# Discard the first row of the file.
# NOTE(review): presumably a non-data row -- confirm against the CSV.
# Negative indexing stays correct when only one row was read, whereas
# 2:nrow(yoga) would count backwards (2, 1) in that case.
yoga <- yoga[-1, ]

# Renumber the rows from 1 after the drop; seq_len() also copes with zero rows.
row.names(yoga) <- seq_len(nrow(yoga))

# The first column holds the period label (e.g. "2004-01").
colnames(yoga)[1] <- "date"

# Preview the first 15 rows as a formatted table.
knitr::kable(head(yoga, 15))
date | Alabama..us.al. | Alaska..us.ak. | Arizona..us.az. | Arkansas..us.ar. | California..us.ca. | Colorado..us.co. | Connecticut..us.ct. | Delaware..us.de. | District.of.Columbia..us.dc. | Florida..us.fl. | Georgia..us.ga. | Hawaii..us.hi. | Idaho..us.id. | Illinois..us.il. | Indiana..us.in. | Iowa..us.ia. | Kansas..us.ks. | Kentucky..us.ky. | Louisiana..us.la. | Maine..us.me. | Maryland..us.md. | Massachusetts..us.ma. | Michigan..us.mi. | Minnesota..us.mn. | Mississippi..us.ms. | Missouri..us.mo. | Montana..us.mt. | Nebraska..us.ne. | Nevada..us.nv. | New.Hampshire..us.nh. | New.Jersey..us.nj. | New.Mexico..us.nm. | New.York..us.ny. | North.Carolina..us.nc. | North.Dakota..us.nd. | Ohio..us.oh. | Oklahoma..us.ok. | Oregon..us.or. | Pennsylvania..us.pa. | Rhode.Island..us.ri. | South.Carolina..us.sc. | South.Dakota..us.sd. | Tennessee..us.tn. | Texas..us.tx. | Utah..us.ut. | Vermont..us.vt. | Virginia..us.va. | Washington..us.wa. | West.Virginia..us.wv. | Wisconsin..us.wi. | Wyoming..us.wy. |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2004-01 | 20 | 23 | 21 | 24 | 32 | 33 | 27 | 47 | 32 | 21 | 21 | 36 | 21 | 25 | 24 | 14 | 20 | 17 | 20 | 29 | 26 | 41 | 19 | 26 | 16 | 19 | 44 | 15 | 20 | 45 | 27 | 33 | 35 | 23 | 52 | 19 | 22 | 34 | 19 | 44 | 24 | 25 | 21 | 24 | 26 | 42 | 22 | 30 | 23 | 18 | 0 |
2004-02 | 8 | 26 | 25 | 16 | 27 | 30 | 26 | 28 | 36 | 17 | 20 | 24 | 22 | 23 | 14 | 16 | 12 | 19 | 15 | 29 | 23 | 33 | 18 | 22 | 20 | 18 | 26 | 21 | 25 | 20 | 22 | 25 | 28 | 22 | 45 | 16 | 19 | 30 | 18 | 26 | 19 | 22 | 18 | 16 | 20 | 39 | 16 | 29 | 17 | 17 | 37 |
2004-03 | 10 | 26 | 22 | 26 | 28 | 29 | 30 | 51 | 29 | 17 | 20 | 36 | 21 | 25 | 17 | 18 | 13 | 18 | 17 | 26 | 22 | 32 | 17 | 21 | 18 | 15 | 41 | 16 | 24 | 22 | 23 | 26 | 29 | 20 | 45 | 15 | 12 | 28 | 20 | 27 | 18 | 21 | 16 | 17 | 10 | 41 | 19 | 27 | 27 | 20 | 35 |
2004-04 | 15 | 34 | 24 | 18 | 25 | 27 | 22 | 27 | 29 | 19 | 15 | 30 | 18 | 17 | 14 | 19 | 19 | 12 | 20 | 21 | 17 | 31 | 15 | 17 | 18 | 12 | 25 | 16 | 17 | 26 | 20 | 18 | 26 | 18 | 45 | 13 | 11 | 22 | 19 | 31 | 14 | 21 | 21 | 16 | 20 | 37 | 17 | 25 | 26 | 17 | 37 |
2004-05 | 15 | 14 | 24 | 11 | 25 | 24 | 25 | 20 | 24 | 20 | 16 | 39 | 14 | 17 | 9 | 14 | 15 | 15 | 14 | 25 | 21 | 27 | 15 | 22 | 13 | 15 | 24 | 14 | 25 | 23 | 22 | 21 | 28 | 16 | 45 | 16 | 17 | 25 | 16 | 26 | 11 | 28 | 15 | 17 | 19 | 31 | 18 | 25 | 21 | 18 | 35 |
2004-06 | 12 | 21 | 23 | 14 | 27 | 27 | 26 | 22 | 26 | 18 | 18 | 44 | 17 | 20 | 18 | 17 | 18 | 17 | 16 | 22 | 23 | 29 | 17 | 18 | 18 | 20 | 21 | 15 | 23 | 27 | 21 | 32 | 28 | 22 | 45 | 16 | 17 | 27 | 20 | 33 | 19 | 28 | 16 | 17 | 16 | 34 | 22 | 27 | 28 | 19 | 35 |
2004-07 | 13 | 17 | 25 | 15 | 28 | 30 | 22 | 34 | 28 | 18 | 20 | 29 | 19 | 20 | 19 | 15 | 19 | 15 | 13 | 32 | 29 | 36 | 16 | 18 | 12 | 16 | 21 | 17 | 15 | 20 | 23 | 25 | 31 | 22 | 28 | 14 | 15 | 30 | 19 | 34 | 19 | 29 | 19 | 19 | 21 | 32 | 21 | 28 | 23 | 18 | 34 |
2004-08 | 11 | 17 | 20 | 15 | 28 | 27 | 27 | 26 | 31 | 17 | 19 | 31 | 26 | 21 | 17 | 14 | 17 | 20 | 16 | 30 | 21 | 36 | 17 | 18 | 12 | 21 | 22 | 19 | 24 | 28 | 22 | 31 | 31 | 20 | 31 | 14 | 11 | 24 | 18 | 23 | 18 | 29 | 13 | 17 | 18 | 35 | 23 | 27 | 32 | 17 | 26 |
2004-09 | 10 | 23 | 20 | 10 | 26 | 25 | 24 | 18 | 25 | 16 | 17 | 32 | 21 | 19 | 18 | 13 | 15 | 15 | 14 | 25 | 21 | 32 | 18 | 17 | 13 | 15 | 22 | 13 | 20 | 21 | 23 | 19 | 30 | 21 | 33 | 13 | 13 | 27 | 18 | 37 | 14 | 28 | 13 | 15 | 25 | 28 | 20 | 26 | 41 | 15 | 23 |
2004-10 | 13 | 23 | 22 | 18 | 25 | 30 | 19 | 35 | 20 | 16 | 18 | 36 | 15 | 16 | 13 | 19 | 15 | 12 | 12 | 26 | 24 | 29 | 16 | 15 | 15 | 15 | 35 | 15 | 14 | 22 | 20 | 14 | 26 | 17 | 20 | 13 | 11 | 28 | 16 | 32 | 16 | 28 | 18 | 15 | 20 | 49 | 15 | 25 | 14 | 20 | 23 |
2004-11 | 15 | 19 | 26 | 16 | 25 | 26 | 24 | 25 | 23 | 15 | 16 | 37 | 26 | 18 | 14 | 15 | 13 | 18 | 12 | 22 | 22 | 26 | 16 | 18 | 11 | 17 | 27 | 16 | 23 | 28 | 21 | 26 | 28 | 17 | 21 | 17 | 9 | 25 | 18 | 32 | 16 | 28 | 17 | 13 | 20 | 39 | 16 | 25 | 14 | 15 | 32 |
2004-12 | 9 | 24 | 19 | 21 | 25 | 29 | 25 | 23 | 23 | 16 | 17 | 34 | 24 | 18 | 16 | 18 | 15 | 12 | 14 | 26 | 21 | 32 | 15 | 21 | 16 | 15 | 23 | 12 | 22 | 28 | 20 | 23 | 27 | 18 | 19 | 15 | 13 | 24 | 17 | 30 | 13 | 28 | 18 | 13 | 21 | 34 | 16 | 24 | 14 | 16 | 25 |
2005-01 | 12 | 19 | 29 | 25 | 30 | 32 | 32 | 37 | 28 | 19 | 23 | 43 | 23 | 20 | 17 | 20 | 19 | 15 | 19 | 36 | 27 | 36 | 22 | 20 | 21 | 19 | 29 | 16 | 26 | 35 | 22 | 36 | 32 | 24 | 17 | 18 | 15 | 24 | 20 | 39 | 20 | 21 | 21 | 17 | 20 | 49 | 23 | 27 | 21 | 24 | 35 |
2005-02 | 12 | 12 | 20 | 19 | 27 | 28 | 23 | 19 | 26 | 16 | 17 | 28 | 18 | 20 | 20 | 18 | 15 | 15 | 16 | 29 | 16 | 31 | 19 | 20 | 16 | 15 | 21 | 12 | 26 | 34 | 23 | 26 | 28 | 20 | 21 | 17 | 18 | 28 | 18 | 32 | 17 | 16 | 16 | 16 | 21 | 35 | 20 | 26 | 22 | 15 | 29 |
2005-03 | 13 | 20 | 21 | 16 | 26 | 29 | 31 | 17 | 28 | 16 | 17 | 27 | 12 | 19 | 17 | 17 | 16 | 15 | 16 | 31 | 21 | 29 | 15 | 17 | 12 | 15 | 16 | 13 | 14 | 37 | 22 | 34 | 30 | 15 | 16 | 16 | 12 | 27 | 19 | 30 | 17 | 14 | 17 | 16 | 19 | 44 | 20 | 25 | 21 | 19 | 32 |
Date column separated into year and month:
# Split the "YYYY-MM" date label into separate year and month columns.
yoga2 <- separate(yoga, date, into = c("year", "month"), sep = "-")

# Show the first ten rows of the first five columns
# (year, month and the first three state columns).
yoga2 %>%
  select(1:5) %>%
  head(10)
## year month Alabama..us.al. Alaska..us.ak. Arizona..us.az.
## 1 2004 01 20 23 21
## 2 2004 02 8 26 25
## 3 2004 03 10 26 22
## 4 2004 04 15 34 24
## 5 2004 05 15 14 24
## 6 2004 06 12 21 23
## 7 2004 07 13 17 25
## 8 2004 08 11 17 20
## 9 2004 09 10 23 20
## 10 2004 10 13 23 22
Extracting state names:
# Grab the raw column names of the original table (date + one per state).
# NOTE(review): this variable is called `c`, which shadows base::c for value
# lookups; calls such as c(...) still resolve to the function. Later chunks
# read this variable, so the name is kept.
c <- names(yoga)
print(c)
## [1] "date" "Alabama..us.al."
## [3] "Alaska..us.ak." "Arizona..us.az."
## [5] "Arkansas..us.ar." "California..us.ca."
## [7] "Colorado..us.co." "Connecticut..us.ct."
## [9] "Delaware..us.de." "District.of.Columbia..us.dc."
## [11] "Florida..us.fl." "Georgia..us.ga."
## [13] "Hawaii..us.hi." "Idaho..us.id."
## [15] "Illinois..us.il." "Indiana..us.in."
## [17] "Iowa..us.ia." "Kansas..us.ks."
## [19] "Kentucky..us.ky." "Louisiana..us.la."
## [21] "Maine..us.me." "Maryland..us.md."
## [23] "Massachusetts..us.ma." "Michigan..us.mi."
## [25] "Minnesota..us.mn." "Mississippi..us.ms."
## [27] "Missouri..us.mo." "Montana..us.mt."
## [29] "Nebraska..us.ne." "Nevada..us.nv."
## [31] "New.Hampshire..us.nh." "New.Jersey..us.nj."
## [33] "New.Mexico..us.nm." "New.York..us.ny."
## [35] "North.Carolina..us.nc." "North.Dakota..us.nd."
## [37] "Ohio..us.oh." "Oklahoma..us.ok."
## [39] "Oregon..us.or." "Pennsylvania..us.pa."
## [41] "Rhode.Island..us.ri." "South.Carolina..us.sc."
## [43] "South.Dakota..us.sd." "Tennessee..us.tn."
## [45] "Texas..us.tx." "Utah..us.ut."
## [47] "Vermont..us.vt." "Virginia..us.va."
## [49] "Washington..us.wa." "West.Virginia..us.wv."
## [51] "Wisconsin..us.wi." "Wyoming..us.wy."
# Keep the part of each column name up to and including the double dot
# ("Alabama..us.al." -> "Alabama.."). Names without a double dot, such as
# "date", yield an empty match and vanish when the list is flattened.
c2 <- c %>%
  str_extract_all(".+\\.{2}") %>%
  unlist()
print(c2)
## [1] "Alabama.." "Alaska.."
## [3] "Arizona.." "Arkansas.."
## [5] "California.." "Colorado.."
## [7] "Connecticut.." "Delaware.."
## [9] "District.of.Columbia.." "Florida.."
## [11] "Georgia.." "Hawaii.."
## [13] "Idaho.." "Illinois.."
## [15] "Indiana.." "Iowa.."
## [17] "Kansas.." "Kentucky.."
## [19] "Louisiana.." "Maine.."
## [21] "Maryland.." "Massachusetts.."
## [23] "Michigan.." "Minnesota.."
## [25] "Mississippi.." "Missouri.."
## [27] "Montana.." "Nebraska.."
## [29] "Nevada.." "New.Hampshire.."
## [31] "New.Jersey.." "New.Mexico.."
## [33] "New.York.." "North.Carolina.."
## [35] "North.Dakota.." "Ohio.."
## [37] "Oklahoma.." "Oregon.."
## [39] "Pennsylvania.." "Rhode.Island.."
## [41] "South.Carolina.." "South.Dakota.."
## [43] "Tennessee.." "Texas.."
## [45] "Utah.." "Vermont.."
## [47] "Virginia.." "Washington.."
## [49] "West.Virginia.." "Wisconsin.."
## [51] "Wyoming.."
# Strip the trailing double dot from each name ("Alabama.." -> "Alabama").
# str_replace() already returns a character vector, so no unlist() is needed.
c3 <- str_replace(c2, pattern = "\\.\\.", replacement = "")
print(c3)
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District.of.Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New.Hampshire"
## [31] "New.Jersey" "New.Mexico" "New.York"
## [34] "North.Carolina" "North.Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode.Island" "South.Carolina" "South.Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "West.Virginia" "Wisconsin" "Wyoming"
# Turn the remaining single dots into spaces ("New.York" -> "New York").
# str_replace_all() already returns a character vector, so no unlist() is needed.
c4 <- str_replace_all(c3, pattern = "\\.", replacement = " ")
print(c4)
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District of Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New Hampshire"
## [31] "New Jersey" "New Mexico" "New York"
## [34] "North Carolina" "North Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode Island" "South Carolina" "South Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "West Virginia" "Wisconsin" "Wyoming"
Renaming the state columns with the cleaned state names:
# Keep the first two column names (year, month) and replace every remaining
# name with the cleaned state names held in c4.
colnames(yoga2) <- c(head(colnames(yoga2), 2), c4)
Gathering all state columns:
# Reshape from wide (one column per state) to long: every column except
# year and month is stacked into State / Yoga_Search pairs.
yoga3 <- gather(yoga2, key = State, value = Yoga_Search, -year, -month)
head(yoga3, n = 15)
## year month State Yoga_Search
## 1 2004 01 Alabama 20
## 2 2004 02 Alabama 8
## 3 2004 03 Alabama 10
## 4 2004 04 Alabama 15
## 5 2004 05 Alabama 15
## 6 2004 06 Alabama 12
## 7 2004 07 Alabama 13
## 8 2004 08 Alabama 11
## 9 2004 09 Alabama 10
## 10 2004 10 Alabama 13
## 11 2004 11 Alabama 15
## 12 2004 12 Alabama 9
## 13 2005 01 Alabama 12
## 14 2005 02 Alabama 12
## 15 2005 03 Alabama 13
Yoga searches grouped by state:
# Sum the search values over all months for each state and order the
# states from the highest total to the lowest.
yoga_search <- arrange(
  summarise(group_by(yoga3, State), Yoga_Search = sum(Yoga_Search)),
  desc(Yoga_Search)
)

# Top 10 states by total.
head(yoga_search, n = 10)
## # A tibble: 10 x 2
## State Yoga_Search
## <chr> <int>
## 1 Vermont 7529
## 2 Hawaii 5699
## 3 Rhode Island 5546
## 4 New Hampshire 5292
## 5 Maine 5251
## 6 Montana 4976
## 7 Wyoming 4815
## 8 Massachusetts 4763
## 9 Alaska 4726
## 10 Colorado 4578
# Bottom 10 states: the last ten rows of the descending-sorted totals.
yoga_search %>%
  tail(10)
## # A tibble: 10 x 2
## State Yoga_Search
## <chr> <int>
## 1 Kansas 2532
## 2 Tennessee 2507
## 3 Texas 2466
## 4 Kentucky 2446
## 5 Louisiana 2440
## 6 Arkansas 2424
## 7 Oklahoma 2421
## 8 Georgia 2383
## 9 Mississippi 2241
## 10 Alabama 2042
Yoga searches by year. The data for 2016 appear to be wrong or incomplete, so that year is excluded:
# Total search values per year, in ascending year order, leaving out 2016.
# `year` is a character column, so it is compared against the string "2016".
yoga_search2 <- yoga3 %>%
  group_by(year) %>%
  summarise(Yoga_Search = sum(Yoga_Search)) %>%
  arrange(year) %>%
  filter(year != "2016")

print(yoga_search2)
## # A tibble: 12 x 2
## year Yoga_Search
## <chr> <int>
## 1 2004 13467
## 2 2005 13173
## 3 2006 12952
## 4 2007 12353
## 5 2008 12085
## 6 2009 12671
## 7 2010 14874
## 8 2011 14197
## 9 2012 14875
## 10 2013 16350
## 11 2014 17442
## 12 2015 18475
Scatter plot showing the association between year and total yoga searches:
# One blue point per year: year on the x axis, yearly total on the y axis.
ggplot(data = yoga_search2, mapping = aes(x = year, y = Yoga_Search)) +
  geom_point(color = "blue")