Assignment

• Places to start: kaggle.com, ourworldindata.org, data.gov…

• Find two examples of un-tidy datasets and two of tidy datasets.

• In R, tidy the un-tidy datasets, and be prepared to discuss them in class next week.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(dplyr)

Untidy Datasets

Japan’s Population Density

Source: https://www.kaggle.com/leedatawhiz/untidy-japanese-prefecture-2015-population-density

# Read the prefecture population-density CSV (path is relative to the
# working directory). Every column is parsed as double.
JAPN_Population <- read_csv("JAPN_Population.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double()
## )
## ℹ Use `spec()` for the full column specifications.
# Inspect the structure: a single row with 47 columns, one column per
# prefecture name. The variable "prefecture" is stored in the column
# headers, which is what makes this layout untidy.
str(JAPN_Population)
## spec_tbl_df [1 × 47] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Aichi    : num 1447
##  $ Akita    : num 87.9
##  $ Aomori   : num 136
##  $ Chiba    : num 1207
##  $ Ehime    : num 244
##  $ Fukui    : num 188
##  $ Fukuoka  : num 1023
##  $ Fukushima: num 139
##  $ Gifu     : num 191
##  $ Gumma    : num 310
##  $ Hiroshima: num 336
##  $ Hokkaido : num 68.6
##  $ Hyogo    : num 659
##  $ Ibaraki  : num 479
##  $ Ishikawa : num 276
##  $ Iwate    : num 83.8
##  $ Kagawa   : num 520
##  $ Kagoshima: num 179
##  $ Kanagawa : num 3778
##  $ Kochi    : num 102
##  $ Kumamoto : num 241
##  $ Kyoto    : num 566
##  $ Mie      : num 314
##  $ Miyagi   : num 320
##  $ Miyazaki : num 143
##  $ Nagano   : num 155
##  $ Nagasaki : num 333
##  $ Nara     : num 370
##  $ Niigata  : num 183
##  $ Oita     : num 184
##  $ Okayama  : num 270
##  $ Okinawa  : num 629
##  $ Osaka    : num 4640
##  $ Saga     : num 341
##  $ Saitama  : num 1912
##  $ Shiga    : num 352
##  $ Shimane  : num 104
##  $ Shizuoka : num 476
##  $ Tochigi  : num 308
##  $ Tokushima: num 182
##  $ Tokyo    : num 6168
##  $ Tottori  : num 164
##  $ Toyama   : num 251
##  $ Wakayama : num 204
##  $ Yamagata : num 120
##  $ Yamaguchi: num 230
##  $ Yamanashi: num 187
##  - attr(*, "spec")=
##   .. cols(
##   ..   Aichi = col_double(),
##   ..   Akita = col_double(),
##   ..   Aomori = col_double(),
##   ..   Chiba = col_double(),
##   ..   Ehime = col_double(),
##   ..   Fukui = col_double(),
##   ..   Fukuoka = col_double(),
##   ..   Fukushima = col_double(),
##   ..   Gifu = col_double(),
##   ..   Gumma = col_double(),
##   ..   Hiroshima = col_double(),
##   ..   Hokkaido = col_double(),
##   ..   Hyogo = col_double(),
##   ..   Ibaraki = col_double(),
##   ..   Ishikawa = col_double(),
##   ..   Iwate = col_double(),
##   ..   Kagawa = col_double(),
##   ..   Kagoshima = col_double(),
##   ..   Kanagawa = col_double(),
##   ..   Kochi = col_double(),
##   ..   Kumamoto = col_double(),
##   ..   Kyoto = col_double(),
##   ..   Mie = col_double(),
##   ..   Miyagi = col_double(),
##   ..   Miyazaki = col_double(),
##   ..   Nagano = col_double(),
##   ..   Nagasaki = col_double(),
##   ..   Nara = col_double(),
##   ..   Niigata = col_double(),
##   ..   Oita = col_double(),
##   ..   Okayama = col_double(),
##   ..   Okinawa = col_double(),
##   ..   Osaka = col_double(),
##   ..   Saga = col_double(),
##   ..   Saitama = col_double(),
##   ..   Shiga = col_double(),
##   ..   Shimane = col_double(),
##   ..   Shizuoka = col_double(),
##   ..   Tochigi = col_double(),
##   ..   Tokushima = col_double(),
##   ..   Tokyo = col_double(),
##   ..   Tottori = col_double(),
##   ..   Toyama = col_double(),
##   ..   Wakayama = col_double(),
##   ..   Yamagata = col_double(),
##   ..   Yamaguchi = col_double(),
##   ..   Yamanashi = col_double()
##   .. )
# Tidy the data: reshape the single wide row into one row per prefecture,
# with the former column names in `Cities` and the values in `Density`.
# `everything()` selects all columns, so the reshape does not depend on the
# file containing exactly 47 of them.
# The result is only printed, not assigned; JAPN_Population itself stays wide.
# NOTE(review): the columns are prefectures (see the source URL), so
# "Prefecture" would be a more accurate name than "Cities"; the name is kept
# so the call matches the recorded output.
JAPN_Population %>%
  pivot_longer(
    cols = everything(),
    names_to = "Cities",
    values_to = "Density"
  )
## # A tibble: 47 x 2
##    Cities    Density
##    <chr>       <dbl>
##  1 Aichi      1447. 
##  2 Akita        87.9
##  3 Aomori      136. 
##  4 Chiba      1207. 
##  5 Ehime       244. 
##  6 Fukui       188. 
##  7 Fukuoka    1023. 
##  8 Fukushima   139. 
##  9 Gifu        191. 
## 10 Gumma       310. 
## # … with 37 more rows
# Preview the original object. The pivoted result was not assigned, so
# JAPN_Population still has the untidy 1 x 47 wide layout.
head(JAPN_Population)
## # A tibble: 1 x 47
##   Aichi Akita Aomori Chiba Ehime Fukui Fukuoka Fukushima  Gifu Gumma Hiroshima
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>     <dbl> <dbl> <dbl>     <dbl>
## 1 1447.  87.9   136. 1207.  244.  188.   1023.      139.  191.  310.      336.
## # … with 36 more variables: Hokkaido <dbl>, Hyogo <dbl>, Ibaraki <dbl>,
## #   Ishikawa <dbl>, Iwate <dbl>, Kagawa <dbl>, Kagoshima <dbl>, Kanagawa <dbl>,
## #   Kochi <dbl>, Kumamoto <dbl>, Kyoto <dbl>, Mie <dbl>, Miyagi <dbl>,
## #   Miyazaki <dbl>, Nagano <dbl>, Nagasaki <dbl>, Nara <dbl>, Niigata <dbl>,
## #   Oita <dbl>, Okayama <dbl>, Okinawa <dbl>, Osaka <dbl>, Saga <dbl>,
## #   Saitama <dbl>, Shiga <dbl>, Shimane <dbl>, Shizuoka <dbl>, Tochigi <dbl>,
## #   Tokushima <dbl>, Tokyo <dbl>, Tottori <dbl>, Toyama <dbl>, Wakayama <dbl>,
## #   Yamagata <dbl>, Yamaguchi <dbl>, Yamanashi <dbl>

MPG Dataset

Source: https://fueleconomy.gov/ (the `mpg` data frame ships with ggplot2, so no file is read here).

# Count cars per manufacturer and class, then spread the classes into
# columns, filling manufacturer/class combinations that never occur with 0.
# The resulting wide table stores the `class` variable in its column
# headers, i.e. it is the untidy form of the data.
# Passing both variables to count() returns an ungrouped tibble, so no
# leftover grouping is carried into any later step.
mpg %>%
  count(manufacturer, class, name = "n") %>%
  pivot_wider(names_from = class, values_from = n, values_fill = 0)
## # A tibble: 15 x 8
##    manufacturer compact midsize `2seater`   suv minivan pickup subcompact
##    <chr>          <int>   <int>     <int> <int>   <int>  <int>      <int>
##  1 audi              15       3         0     0       0      0          0
##  2 chevrolet          0       5         5     9       0      0          0
##  3 dodge              0       0         0     7      11     19          0
##  4 ford               0       0         0     9       0      7          9
##  5 honda              0       0         0     0       0      0          9
##  6 hyundai            0       7         0     0       0      0          7
##  7 jeep               0       0         0     8       0      0          0
##  8 land rover         0       0         0     4       0      0          0
##  9 lincoln            0       0         0     3       0      0          0
## 10 mercury            0       0         0     4       0      0          0
## 11 nissan             2       7         0     4       0      0          0
## 12 pontiac            0       5         0     0       0      0          0
## 13 subaru             4       0         0     6       0      0          4
## 14 toyota            12       7         0     8       0      7          0
## 15 volkswagen        14       7         0     0       0      0          6
# Preview the first six rows of the original mpg data (11 columns); the
# pipeline above does not modify `mpg`.
head(mpg)
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…

Tidy Datasets

Artists Data

Source: https://www.kaggle.com/ikarus777/best-artworks-of-all-time

# Read the artists CSV (path is relative to the working directory).
# `id` and `paintings` are parsed as double; the other six columns are
# parsed as character.
Artists <- read_csv("Artists.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   id = col_double(),
##   name = col_character(),
##   years = col_character(),
##   genre = col_character(),
##   nationality = col_character(),
##   bio = col_character(),
##   wikipedia = col_character(),
##   paintings = col_double()
## )
# Preview the first six rows of the 8-column artists table.
head(Artists)
## # A tibble: 6 x 8
##      id name    years  genre   nationality bio           wikipedia     paintings
##   <dbl> <chr>   <chr>  <chr>   <chr>       <chr>         <chr>             <dbl>
## 1     0 Amedeo… 1884 … Expres… Italian     "Amedeo Clem… http://en.wi…       193
## 2     1 Vasili… 1866 … Expres… Russian     "Wassily Was… http://en.wi…        88
## 3     2 Diego … 1886 … Social… Mexican     "Diego María… http://en.wi…        70
## 4     3 Claude… 1840 … Impres… French      "Oscar-Claud… http://en.wi…        73
## 5     4 Rene M… 1898 … Surrea… Belgian     "René Franço… http://en.wi…       194
## 6     5 Salvad… 1904 … Surrea… Spanish     "Salvador Do… http://en.wi…       139

Chocolate Bar Ratings

Source: https://www.kaggle.com/rtatman/chocolate-bar-ratings

# Read the chocolate bar ratings CSV (path is relative to the working
# directory).
# NOTE(review): several column names contain embedded line breaks (e.g.
# `Review\nDate`), so they must be back-quoted to be referenced, and the
# cocoa percentage column is parsed as character (values such as "63%").
# Renaming the columns and parsing the percentage to a number would be
# needed before analysis.
Chocolate <- read_csv("Chocolate.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Company 
## (Maker-if known)` = col_character(),
##   `Specific Bean Origin
## or Bar Name` = col_character(),
##   REF = col_double(),
##   `Review
## Date` = col_double(),
##   `Cocoa
## Percent` = col_character(),
##   `Company
## Location` = col_character(),
##   Rating = col_double(),
##   `Bean
## Type` = col_character(),
##   `Broad Bean
## Origin` = col_character()
## )
# Preview the first six rows of the 9-column chocolate ratings table.
head(Chocolate)
## # A tibble: 6 x 9
##   `Company \n(Mak… `Specific Bean …   REF `Review\nDate` `Cocoa\nPercent`
##   <chr>            <chr>            <dbl>          <dbl> <chr>           
## 1 A. Morin         Agua Grande       1876           2016 63%             
## 2 A. Morin         Kpime             1676           2015 70%             
## 3 A. Morin         Atsane            1676           2015 70%             
## 4 A. Morin         Akata             1680           2015 70%             
## 5 A. Morin         Quilla            1704           2015 70%             
## 6 A. Morin         Carenero          1315           2014 70%             
## # … with 4 more variables: `Company\nLocation` <chr>, Rating <dbl>,
## #   `Bean\nType` <chr>, `Broad Bean\nOrigin` <chr>