• Places to start: kaggle.com, ourworldindata.org, data.gov…
• Find two examples of un-tidy datasets and two of tidy datasets.
• In R, tidy the un-tidy datasets, and be prepared to discuss them in class next week.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(dplyr)
Source: https://www.kaggle.com/leedatawhiz/untidy-japanese-prefecture-2015-population-density
# Load the un-tidy population table from the working directory.
JAPN_Population <- read_csv(file = "JAPN_Population.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
# Inspect the structure: column names, column types and dimensions.
JAPN_Population %>% str()
## spec_tbl_df [1 × 47] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Aichi : num 1447
## $ Akita : num 87.9
## $ Aomori : num 136
## $ Chiba : num 1207
## $ Ehime : num 244
## $ Fukui : num 188
## $ Fukuoka : num 1023
## $ Fukushima: num 139
## $ Gifu : num 191
## $ Gumma : num 310
## $ Hiroshima: num 336
## $ Hokkaido : num 68.6
## $ Hyogo : num 659
## $ Ibaraki : num 479
## $ Ishikawa : num 276
## $ Iwate : num 83.8
## $ Kagawa : num 520
## $ Kagoshima: num 179
## $ Kanagawa : num 3778
## $ Kochi : num 102
## $ Kumamoto : num 241
## $ Kyoto : num 566
## $ Mie : num 314
## $ Miyagi : num 320
## $ Miyazaki : num 143
## $ Nagano : num 155
## $ Nagasaki : num 333
## $ Nara : num 370
## $ Niigata : num 183
## $ Oita : num 184
## $ Okayama : num 270
## $ Okinawa : num 629
## $ Osaka : num 4640
## $ Saga : num 341
## $ Saitama : num 1912
## $ Shiga : num 352
## $ Shimane : num 104
## $ Shizuoka : num 476
## $ Tochigi : num 308
## $ Tokushima: num 182
## $ Tokyo : num 6168
## $ Tottori : num 164
## $ Toyama : num 251
## $ Wakayama : num 204
## $ Yamagata : num 120
## $ Yamaguchi: num 230
## $ Yamanashi: num 187
## - attr(*, "spec")=
## .. cols(
## .. Aichi = col_double(),
## .. Akita = col_double(),
## .. Aomori = col_double(),
## .. Chiba = col_double(),
## .. Ehime = col_double(),
## .. Fukui = col_double(),
## .. Fukuoka = col_double(),
## .. Fukushima = col_double(),
## .. Gifu = col_double(),
## .. Gumma = col_double(),
## .. Hiroshima = col_double(),
## .. Hokkaido = col_double(),
## .. Hyogo = col_double(),
## .. Ibaraki = col_double(),
## .. Ishikawa = col_double(),
## .. Iwate = col_double(),
## .. Kagawa = col_double(),
## .. Kagoshima = col_double(),
## .. Kanagawa = col_double(),
## .. Kochi = col_double(),
## .. Kumamoto = col_double(),
## .. Kyoto = col_double(),
## .. Mie = col_double(),
## .. Miyagi = col_double(),
## .. Miyazaki = col_double(),
## .. Nagano = col_double(),
## .. Nagasaki = col_double(),
## .. Nara = col_double(),
## .. Niigata = col_double(),
## .. Oita = col_double(),
## .. Okayama = col_double(),
## .. Okinawa = col_double(),
## .. Osaka = col_double(),
## .. Saga = col_double(),
## .. Saitama = col_double(),
## .. Shiga = col_double(),
## .. Shimane = col_double(),
## .. Shizuoka = col_double(),
## .. Tochigi = col_double(),
## .. Tokushima = col_double(),
## .. Tokyo = col_double(),
## .. Tottori = col_double(),
## .. Toyama = col_double(),
## .. Wakayama = col_double(),
## .. Yamagata = col_double(),
## .. Yamaguchi = col_double(),
## .. Yamanashi = col_double()
## .. )
# Tidy the table: every column name becomes a value of `Cities` and the cell
# under it becomes `Density`, giving one row per input column.
# `everything()` replaces the hard-coded `1:47`, so the reshape keeps working
# if the CSV gains or loses columns.
# NOTE(review): the source URL describes these as prefectures, not cities;
# the `Cities` label is kept so the printed result stays the same.
JAPN_Population %>%
  pivot_longer(
    cols = everything(),
    names_to = "Cities",
    values_to = "Density"
  )
## # A tibble: 47 x 2
## Cities Density
## <chr> <dbl>
## 1 Aichi 1447.
## 2 Akita 87.9
## 3 Aomori 136.
## 4 Chiba 1207.
## 5 Ehime 244.
## 6 Fukui 188.
## 7 Fukuoka 1023.
## 8 Fukushima 139.
## 9 Gifu 191.
## 10 Gumma 310.
## # … with 37 more rows
# Show the original object again: the pivoted result was only printed, never
# assigned, so JAPN_Population itself is still in its wide form.
head(JAPN_Population, n = 6L)
## # A tibble: 1 x 47
## Aichi Akita Aomori Chiba Ehime Fukui Fukuoka Fukushima Gifu Gumma Hiroshima
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1447. 87.9 136. 1207. 244. 188. 1023. 139. 191. 310. 336.
## # … with 36 more variables: Hokkaido <dbl>, Hyogo <dbl>, Ibaraki <dbl>,
## # Ishikawa <dbl>, Iwate <dbl>, Kagawa <dbl>, Kagoshima <dbl>, Kanagawa <dbl>,
## # Kochi <dbl>, Kumamoto <dbl>, Kyoto <dbl>, Mie <dbl>, Miyagi <dbl>,
## # Miyazaki <dbl>, Nagano <dbl>, Nagasaki <dbl>, Nara <dbl>, Niigata <dbl>,
## # Oita <dbl>, Okayama <dbl>, Okinawa <dbl>, Osaka <dbl>, Saga <dbl>,
## # Saitama <dbl>, Shiga <dbl>, Shimane <dbl>, Shizuoka <dbl>, Tochigi <dbl>,
## # Tokushima <dbl>, Tokyo <dbl>, Tottori <dbl>, Toyama <dbl>, Wakayama <dbl>,
## # Yamagata <dbl>, Yamaguchi <dbl>, Yamanashi <dbl>
Source: https://fueleconomy.gov/.
# Count the rows of `mpg` for each manufacturer/class pair, then spread the
# classes across columns; pairs that never occur are filled with 0.
# Counting on both columns directly replaces `group_by()` + `count()`, which
# left the wide result grouped by manufacturer.
mpg %>%
  count(manufacturer, class, name = "n") %>%
  pivot_wider(names_from = class, values_from = n, values_fill = 0)
## # A tibble: 15 x 8
## manufacturer compact midsize `2seater` suv minivan pickup subcompact
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 audi 15 3 0 0 0 0 0
## 2 chevrolet 0 5 5 9 0 0 0
## 3 dodge 0 0 0 7 11 19 0
## 4 ford 0 0 0 9 0 7 9
## 5 honda 0 0 0 0 0 0 9
## 6 hyundai 0 7 0 0 0 0 7
## 7 jeep 0 0 0 8 0 0 0
## 8 land rover 0 0 0 4 0 0 0
## 9 lincoln 0 0 0 3 0 0 0
## 10 mercury 0 0 0 4 0 0 0
## 11 nissan 2 7 0 4 0 0 0
## 12 pontiac 0 5 0 0 0 0 0
## 13 subaru 4 0 0 6 0 0 4
## 14 toyota 12 7 0 8 0 7 0
## 15 volkswagen 14 7 0 0 0 0 6
# Preview the first six rows of `mpg` in its original layout.
head(mpg, n = 6L)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa…
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa…
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa…
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa…
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa…
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compa…
Source: https://www.kaggle.com/ikarus777/best-artworks-of-all-time
# Load the artists table from the working directory.
Artists <- read_csv(file = "Artists.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## id = col_double(),
## name = col_character(),
## years = col_character(),
## genre = col_character(),
## nationality = col_character(),
## bio = col_character(),
## wikipedia = col_character(),
## paintings = col_double()
## )
# Preview the first six artists.
head(Artists, n = 6L)
## # A tibble: 6 x 8
## id name years genre nationality bio wikipedia paintings
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 0 Amedeo… 1884 … Expres… Italian "Amedeo Clem… http://en.wi… 193
## 2 1 Vasili… 1866 … Expres… Russian "Wassily Was… http://en.wi… 88
## 3 2 Diego … 1886 … Social… Mexican "Diego María… http://en.wi… 70
## 4 3 Claude… 1840 … Impres… French "Oscar-Claud… http://en.wi… 73
## 5 4 Rene M… 1898 … Surrea… Belgian "René Franço… http://en.wi… 194
## 6 5 Salvad… 1904 … Surrea… Spanish "Salvador Do… http://en.wi… 139
Source: https://www.kaggle.com/rtatman/chocolate-bar-ratings
# Load the chocolate bar ratings from the working directory.
# NOTE(review): per the column specification printed on import, several
# column names contain embedded line breaks and the cocoa percentage is read
# as character; the table is loaded as-is and not cleaned here.
Chocolate <- read_csv(file = "Chocolate.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## `Company
## (Maker-if known)` = col_character(),
## `Specific Bean Origin
## or Bar Name` = col_character(),
## REF = col_double(),
## `Review
## Date` = col_double(),
## `Cocoa
## Percent` = col_character(),
## `Company
## Location` = col_character(),
## Rating = col_double(),
## `Bean
## Type` = col_character(),
## `Broad Bean
## Origin` = col_character()
## )
# Preview the first six chocolate ratings.
head(Chocolate, n = 6L)
## # A tibble: 6 x 9
## `Company \n(Mak… `Specific Bean … REF `Review\nDate` `Cocoa\nPercent`
## <chr> <chr> <dbl> <dbl> <chr>
## 1 A. Morin Agua Grande 1876 2016 63%
## 2 A. Morin Kpime 1676 2015 70%
## 3 A. Morin Atsane 1676 2015 70%
## 4 A. Morin Akata 1680 2015 70%
## 5 A. Morin Quilla 1704 2015 70%
## 6 A. Morin Carenero 1315 2014 70%
## # … with 4 more variables: `Company\nLocation` <chr>, Rating <dbl>,
## # `Bean\nType` <chr>, `Broad Bean\nOrigin` <chr>