Starting up

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(knitr)

Some useful functions:

filter() - selection of records (rows) based on pattern
select() - selection of columns based on pattern
mutate() - add new column
group_by() - Establish the order of priority
first(), last() - select end records
separate(): break into multiple columns
unite(): gather multiple columns into a single columns

CHALLENGE

Which of these tables is a tidy dataset?

Table 1: multiple columns per record

table1 = tribble(
 ~country, ~year, ~cases, ~population,
 # -------/-----/-------/----------
"Afghanistan", 1999, 745, 19987071,
"Afghanistan", 2000, 2666, 20595360,
"Brazil", 1999, 37737, 172006362,
"Brazil", 2000, 80488, 174504898,
"China", 1999, 212258, 1272915272,
"China", 2000, 213766, 1280428583)

Table 2: data spread across 2 rows

table2 = tribble(
~country, ~year, ~type, ~count,
# -------/------/------/----------
"Afghanistan", 1999, "cases", 745,
"Afghanistan", 1999, "population", 19987071,
"Afghanistan", 2000, "cases", 2666,
"Afghanistan", 2000, "population", 20595360,
"Brazil", 1999, "cases", 37737,
"Brazil", 1999, "population", 172006362,
"Brazil", 2000, "cases",  80488,  
"Brazil", 2000, "population",  174504898, 
"China",  1999,  "cases", 212258, 
"China",  1999,  "population", 1272915272,
"China",  2000,  "cases", 213766,
"China",  2000,  "population", 1280428583)

Table 3 Multiple fields in one column

table3 = tribble(
 ~country, ~year, ~rate,
# --------/------/--------
"Afghanistan", 1999, "745/19987071",
"Afghanistan", 2000, "2666/20595360",
"Brazil", 1999, "37737/172006362",
"Brazil", 2000, "80488/174504898",
"China", 1999, "212258/1272915272",
"China", 2000, "213766/1280428583")

Table 4: Data spread across 2 datasets

table4 = tribble( #cases
~country, ~'1999', ~'2000',
# ------/-------/--------
"Afghanistan", 745, 2666,
"Brazil", 37737, 80488,
"China", 212258, 213766)

table4b = tribble(# population
~country, ~'1999', ~'2000',
# -------/--------/--------
"Afghanistan", 19987071, 20595360,
"Brazil", 172006362, 174504898,
"China", 1272915272, 1280428583)

Back

Basic Rules

Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.

Compute rate per 10,000

table1 %>% 
  mutate(rate = cases / population * 10000)

## # A tibble: 6 × 5
##   country      year  cases population  rate
##   <chr>       <dbl>  <dbl>      <dbl> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67

Compute cases per year

table1 %>%
    count(year, wt=cases)

## # A tibble: 2 × 2
##    year      n
##   <dbl>  <dbl>
## 1  1999 250740
## 2  2000 296920

Gathering columns

a4 = table4a %>%
   gather('1999','2000',
          key = "year",
          value = "cases")
a4

## # A tibble: 6 × 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

b4 = table4b %>%
   gather('1999','2000',
          key = "year",
          value = "population")
b4

## # A tibble: 6 × 3
##   country     year  population
##   <chr>       <chr>      <dbl>
## 1 Afghanistan 1999    19987071
## 2 Brazil      1999   172006362
## 3 China       1999  1272915272
## 4 Afghanistan 2000    20595360
## 5 Brazil      2000   174504898
## 6 China       2000  1280428583

Joining datasets together

left_join(a4,b4)

## Joining, by = c("country", "year")

## # A tibble: 6 × 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <dbl>
## 1 Afghanistan 1999     745   19987071
## 2 Brazil      1999   37737  172006362
## 3 China       1999  212258 1272915272
## 4 Afghanistan 2000    2666   20595360
## 5 Brazil      2000   80488  174504898
## 6 China       2000  213766 1280428583

Spreading an overloaded column

table2

## # A tibble: 12 × 4
##    country      year type            count
##    <chr>       <dbl> <chr>           <dbl>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

table2 %>%
  spread(key=type, value=count)

## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <dbl>  <dbl>      <dbl>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

Separating a column into multiple columns

table3

## # A tibble: 6 × 3
##   country      year rate             
##   <chr>       <dbl> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

table3 %>%
  separate(rate, into=c("cases","population"), sep="/")

## # A tibble: 6 × 4
##   country      year cases  population
##   <chr>       <dbl> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

Separate by character count

table5 = table3 %>%
  separate(year, into = c("century","yr"),sep=2)
table5

## # A tibble: 6 × 4
##   country     century yr    rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

Uniting 2 column together

table5 %>%
  unite(year,century,yr,sep = "")

## # A tibble: 6 × 3
##   country     year  rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

Homework

Download the WHO CSV file on Tuberculosis infections fromthe WHO tuberculosis morbity data at https://extranet.who.int/tme/generateCSV.asp?ds=provisional_notifications (see https://www.who.int/teams/global-tuberculosis-programme/data)
Create a Tidy dataset that contains the informations for these countries:
- Thailand
- Indonesia
- United States
- Laos
- China
Create a plot of the tuberculosis morbidity between Jan 2020 and Aug 2021. For all of these countries.

Data dictionary

Column lbl	Description	Column lbl	Description
1. country	Name of country	13. m_06	New Cases in Jun
2. iso2	2-digit country code	14. m_07	New Cases in Jul
3. iso3	3-digit country code	15. m_08	New Cases in Aug
4. iso_numeric	Numeric country code	16. m_09	New cases in Sept
5. g_whoregion	WHO Region	17. m_10	New Cases in Oct
6. year	Calendar year	18. m_11	New cases in Nov
7.report_frequency	Annual report freq	19. m_12	New Cases in Dec
8.report_coverage	Per cent coverage	20. q_1	New Cases in 1st quarter
9. m_02	New Cases in Feb	21. q_2	New Cases in 2nd quarter
10. m_03	New Cases in Mar	22. q_3	New Cases in 3rd quarter
11. m_04	New Cases in Apr	23. q_4	New Cases in 3rd quarter
12. m_05	New Cases in May

Possible Term Projects

The effect of COVID pandemic on the maintenance of airplanes used in the American domestic routines (USDOT On time flight data)
Counting number of kernel popped from a sound clip of popcorn as a method to choose the right temperature and oil (AUDIO clip)
Determining the effect of weather, vacations and time of day on the pedestrian travel across the Brooklyn Bridge (NY City open DATA)
Determining the noise levels from airplanes on the communities surrounding the Chiang Mai airport (flighttracker.com)
The effect of weekday, weather and vacation schedules on gas prices in the lower 48 states of the US (GasBuddy.com)
Comparison of the effect of COVID19 on household income and expenditures in each of the provinces of Thailand (data.go.th)
Identification of the areas of Thailand that have the most frequent forest fires during the hot season. NASA FIRMS
Gender and age discrimination in position and salaries in Baltimore city jobs (Baltimore Open Data)

CS 424 Big Data Analytics

Session 12: Tidying and Cleaning Data

R Studio Interface

Starting up

Workflow

Some useful functions:

CHALLENGE

Table 1: multiple columns per record

Table 2: data spread across 2 rows

Table 3 Multiple fields in one column

Table 4: Data spread across 2 datasets

Basic Rules

Compute rate per 10,000

Compute cases per year

Gathering columns

Joining datasets together

Spreading an overloaded column

Separating a column into multiple columns

Separate by character count

Uniting 2 column together

Homework

Data dictionary

Possible Term Projects