# Load the tidyverse
library(tidyverse)
# Load the data file
heating <- read_csv("data/heating.csv")
# Reading first 6 records
heating
## # A tibble: 14 × 9
## Source `Under 25 years old` `25 to 29 years old` `30 to 34 years old`
## <chr> <chr> <chr> <chr>
## 1 Warm-air furn… 2546 5061 6701
## 2 Steam or hot … 326 672 926
## 3 Electric heat… 529 919 1106
## 4 Built-in elec… 280 430 471
## 5 Floor, wall, … 267 423 513
## 6 Room heaters … 15 29 66
## 7 Room heaters … 18 38 42
## 8 Portable elec… 122 112 167
## 9 Stoves 23 35 38
## 10 Fireplaces wi… 11 13 14
## 11 Fireplaces wi… 6 4 1
## 12 Cooking stove . Z .
## 13 Other 18 48 57
## 14 None 43 77 63
## # ℹ 5 more variables: `35 to 44 years old` <dbl>, `45 to 54 years old` <dbl>,
## # `55 to 64 years old` <dbl>, `65 to 74 years old` <dbl>,
## # `75 years old and over` <dbl>
# Converting wide to long format
heating <- heating %>%
gather(key="age", value="homes", -Source)
# Reading first 6 records
heating
## # A tibble: 112 × 3
## Source age homes
## <chr> <chr> <chr>
## 1 Warm-air furnace Under 25 ye… 2546
## 2 Steam or hot water system Under 25 ye… 326
## 3 Electric heat pump Under 25 ye… 529
## 4 Built-in electric units Under 25 ye… 280
## 5 Floor, wall, or other built-in hot-air units without ducts Under 25 ye… 267
## 6 Room heaters with flue Under 25 ye… 15
## 7 Room heaters without flue Under 25 ye… 18
## 8 Portable electric heaters Under 25 ye… 122
## 9 Stoves Under 25 ye… 23
## 10 Fireplaces with inserts Under 25 ye… 11
## # ℹ 102 more rows
# Look at a summary
summary(heating)
## Source age homes
## Length:112 Length:112 Length:112
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
# Attempt to convert homes from character to numeric
heating %>%
mutate(homes=as.numeric(homes))
## # A tibble: 112 × 3
## Source age homes
## <chr> <chr> <dbl>
## 1 Warm-air furnace Under 25 ye… 2546
## 2 Steam or hot water system Under 25 ye… 326
## 3 Electric heat pump Under 25 ye… 529
## 4 Built-in electric units Under 25 ye… 280
## 5 Floor, wall, or other built-in hot-air units without ducts Under 25 ye… 267
## 6 Room heaters with flue Under 25 ye… 15
## 7 Room heaters without flue Under 25 ye… 18
## 8 Portable electric heaters Under 25 ye… 122
## 9 Stoves Under 25 ye… 23
## 10 Fireplaces with inserts Under 25 ye… 11
## # ℹ 102 more rows
# Find those values
heating %>%
filter(is.na(as.numeric(homes)))
## # A tibble: 3 × 3
## Source age homes
## <chr> <chr> <chr>
## 1 Cooking stove Under 25 years old .
## 2 Cooking stove 25 to 29 years old Z
## 3 Cooking stove 30 to 34 years old .
# Replace with zeros
heating <- heating %>%
mutate(homes=ifelse(homes==".", 0, homes)) %>%
mutate(homes=ifelse(homes=="Z", 0, homes)) %>%
mutate(homes=as.numeric(homes)) %>%
filter(Source=="Cooking stove")
# Summary of the dataset
summary(heating)
## Source age homes
## Length:8 Length:8 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Median : 7.000
## Mean : 8.375
## 3rd Qu.:13.250
## Max. :22.000
# Load the data file
land <- read_csv("data/publiclands.csv")
# Read first 6 records of the dataset
land
## # A tibble: 42 × 2
## State PublicLandAcres
## <chr> <dbl>
## 1 Alabama 665000
## 2 Alaska 22083000
## 3 Arizona 11255000
## 4 Arkansas 2586000
## 5 California 20698000
## 6 Colorado 14509000
## 7 Florida 1153000
## 8 Georgia 865000
## 9 Idaho 20458000
## 10 Illinois 293000
## # ℹ 32 more rows
# Look at the data
summary(land)
## State PublicLandAcres
## Length:42 Min. : 16000
## Class :character 1st Qu.: 606250
## Mode :character Median : 1156000
## Mean : 4577905
## 3rd Qu.: 7592500
## Max. :22083000
# How many rows are there?
nrow(land)
## [1] 42
# How many unique states are there?
unique(land$State)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Florida" "Georgia"
## [9] "Idaho" "Illinois" "Indiana" "Kansas"
## [13] "Kentucky" "Louisiana" "Maine" "Michigan"
## [17] "Minnesota" "Mississippi" "Missouri" "Montana"
## [21] "Nebraska" "Nevada" "New Hampshire" "New Mexico"
## [25] "New York" "North Carolina" "North Dakota" "Ohio"
## [29] "Oklahoma" "Oregon" "Pennsylvania" "South Carolina"
## [33] "South Dakota" "Tennessee" "Texas" "Utah"
## [37] "Vermont" "Virginia" "Washington" "West Virginia"
## [41] "Wisconsin" "Wyoming"
# Insert missing states
missing_states <- tibble(State=c('Connecticut', 'Delaware', 'Hawaii', 'Iowa', 'Maryland',
'Massachusetts','New Jersey', 'Rhode Island'),
PublicLandAcres=c(0,0,0,0,0,0,0,0))
# Display missing states
missing_states
## # A tibble: 8 × 2
## State PublicLandAcres
## <chr> <dbl>
## 1 Connecticut 0
## 2 Delaware 0
## 3 Hawaii 0
## 4 Iowa 0
## 5 Maryland 0
## 6 Massachusetts 0
## 7 New Jersey 0
## 8 Rhode Island 0
# Concatenate two data tables
land <- rbind(land, missing_states)
land
## # A tibble: 50 × 2
## State PublicLandAcres
## <chr> <dbl>
## 1 Alabama 665000
## 2 Alaska 22083000
## 3 Arizona 11255000
## 4 Arkansas 2586000
## 5 California 20698000
## 6 Colorado 14509000
## 7 Florida 1153000
## 8 Georgia 865000
## 9 Idaho 20458000
## 10 Illinois 293000
## # ℹ 40 more rows
# Load the data file
employees <- read_csv("data/employees.csv")
employees
## # A tibble: 6 × 4
## FirstName LastName Salary NumDependents
## <chr> <chr> <dbl> <dbl>
## 1 Alexander Hamilton 40000 3
## 2 Aaron Burr 50000 2
## 3 George Washington 60000 1
## 4 Maria Reynolds NA 4
## 5 Angelica Schuyler 10000 NA
## 6 Hercules Mulligan 20000 0
# What happens when we try some aggregate functions?
sum(employees$Salary)
## [1] NA
mean(employees$Salary)
## [1] NA
max(employees$Salary)
## [1] NA
# We can remove missing values before our calculation
sum(employees$Salary, na.rm=TRUE)
## [1] 180000
mean(employees$Salary, na.rm=TRUE)
## [1] 36000
max(employees$Salary, na.rm=TRUE)
## [1] 60000
# Load the data file
continents <- read_csv("data/continents.csv")
continents
## # A tibble: 10 × 3
## Continent Size Population
## <chr> <dbl> <dbl>
## 1 Africa 11668599 1110635000
## 2 Antarctica 5400000 4490
## 3 Antarctica 5400000 4490000
## 4 Asia 17212000 4298723000
## 5 Australia 3291903 38304000
## 6 Europe 3930000 452000
## 7 Europe 3930000 452000
## 8 Europe 3930000 452000
## 9 North America 9540000 565265000
## 10 South America 6890000 406740000
# Remove duplicated rows
continents <- unique(continents)
continents
## # A tibble: 8 × 3
## Continent Size Population
## <chr> <dbl> <dbl>
## 1 Africa 11668599 1110635000
## 2 Antarctica 5400000 4490
## 3 Antarctica 5400000 4490000
## 4 Asia 17212000 4298723000
## 5 Australia 3291903 38304000
## 6 Europe 3930000 452000
## 7 North America 9540000 565265000
## 8 South America 6890000 406740000
# Filter out illogical records for Antarctica
continents <- continents %>%
filter(!(Continent=='Antarctica' & Population>100000))
continents
## # A tibble: 7 × 3
## Continent Size Population
## <chr> <dbl> <dbl>
## 1 Africa 11668599 1110635000
## 2 Antarctica 5400000 4490
## 3 Asia 17212000 4298723000
## 4 Australia 3291903 38304000
## 5 Europe 3930000 452000
## 6 North America 9540000 565265000
## 7 South America 6890000 406740000
# Load the data file
carpinteria <- read_csv("data/population.csv")
carpinteria
## # A tibble: 16 × 2
## Subject Population
## <chr> <dbl>
## 1 Total 13553
## 2 Male 6514
## 3 Female 7039
## 4 Under 5 years 635
## 5 5 to 9 years 554
## 6 10 to 14 years 552
## 7 15 to 19 years 796
## 8 20 to 24 years 1006
## 9 25 to 34 years 1839
## 10 35 to 44 years 1634
## 11 45 to 54 years 1822
## 12 55 to 59 years 1337
## 13 60 to 64 years 1024
## 14 65 to 74 years 1276
## 15 75 to 84 years 642
## 16 85 years and over 436
# What is the population of Carpinteria?
sum(carpinteria$Population)
## [1] 40659
# Remove total and gender breakouts
carpinteria <- carpinteria %>%
filter(!(Subject %in% c('Total', 'Male', 'Female')))
carpinteria
## # A tibble: 13 × 2
## Subject Population
## <chr> <dbl>
## 1 Under 5 years 635
## 2 5 to 9 years 554
## 3 10 to 14 years 552
## 4 15 to 19 years 796
## 5 20 to 24 years 1006
## 6 25 to 34 years 1839
## 7 35 to 44 years 1634
## 8 45 to 54 years 1822
## 9 55 to 59 years 1337
## 10 60 to 64 years 1024
## 11 65 to 74 years 1276
## 12 75 to 84 years 642
## 13 85 years and over 436
# What is the population of Carpinteria?
sum(carpinteria$Population)
## [1] 13553
library(lubridate)
# Load the data file
weather <- read_csv("data/mexicanweather.csv")
# Let's look at what we have
weather
## # A tibble: 33,712 × 4
## station element value date
## <chr> <chr> <dbl> <date>
## 1 MX000017004 TMAX 310 1955-04-01
## 2 MX000017004 TMIN 150 1955-04-01
## 3 MX000017004 TMAX 310 1955-05-01
## 4 MX000017004 TMIN 200 1955-05-01
## 5 MX000017004 TMAX 300 1955-06-01
## 6 MX000017004 TMIN 160 1955-06-01
## 7 MX000017004 TMAX 270 1955-07-01
## 8 MX000017004 TMIN 150 1955-07-01
## 9 MX000017004 TMAX 230 1955-08-01
## 10 MX000017004 TMIN 140 1955-08-01
## # ℹ 33,702 more rows
# We can use lubridate functions to extract elements of the date
weather$year <- year(weather$date)
weather$month <- month(weather$date)
weather$day <- day(weather$date)
weather
## # A tibble: 33,712 × 7
## station element value date year month day
## <chr> <chr> <dbl> <date> <dbl> <dbl> <int>
## 1 MX000017004 TMAX 310 1955-04-01 1955 4 1
## 2 MX000017004 TMIN 150 1955-04-01 1955 4 1
## 3 MX000017004 TMAX 310 1955-05-01 1955 5 1
## 4 MX000017004 TMIN 200 1955-05-01 1955 5 1
## 5 MX000017004 TMAX 300 1955-06-01 1955 6 1
## 6 MX000017004 TMIN 160 1955-06-01 1955 6 1
## 7 MX000017004 TMAX 270 1955-07-01 1955 7 1
## 8 MX000017004 TMIN 150 1955-07-01 1955 7 1
## 9 MX000017004 TMAX 230 1955-08-01 1955 8 1
## 10 MX000017004 TMIN 140 1955-08-01 1955 8 1
## # ℹ 33,702 more rows
# We can also extract some derived values such as the weekday
wday("2018-04-01")
## [1] 1
# or day of the year
yday("2018-04-01")
## [1] 91
# We can also use lubridate to create date values out of different strings
mdy("04/01/2018")
## [1] "2018-04-01"
mdy("04/01/18")
## [1] "2018-04-01"
dmy("04/01/18")
## [1] "2018-01-04"
ymd("2018-04-01")
## [1] "2018-04-01"
# And we can include times
ymd_hms("2018-04-01 08:00:00")
## [1] "2018-04-01 08:00:00 UTC"
# Let's force that to eastern time
ymd_hms("2018-04-01 08:00:00", tz='EST')
## [1] "2018-04-01 08:00:00 EST"
# Load the data file
weather <- read_csv("data/mexicanweather.csv")
weather
## # A tibble: 33,712 × 4
## station element value date
## <chr> <chr> <dbl> <date>
## 1 MX000017004 TMAX 310 1955-04-01
## 2 MX000017004 TMIN 150 1955-04-01
## 3 MX000017004 TMAX 310 1955-05-01
## 4 MX000017004 TMIN 200 1955-05-01
## 5 MX000017004 TMAX 300 1955-06-01
## 6 MX000017004 TMIN 160 1955-06-01
## 7 MX000017004 TMAX 270 1955-07-01
## 8 MX000017004 TMIN 150 1955-07-01
## 9 MX000017004 TMAX 230 1955-08-01
## 10 MX000017004 TMIN 140 1955-08-01
## # ℹ 33,702 more rows
# Let's make this dataset t a little wider to get the
# minimum and maximum temperatures as part of the same observation.
# That requires the spread function
weather <- weather %>%
spread(element, value)
weather
## # A tibble: 16,871 × 4
## station date TMAX TMIN
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 310 150
## 2 MX000017004 1955-04-02 310 150
## 3 MX000017004 1955-04-03 310 160
## 4 MX000017004 1955-04-04 320 150
## 5 MX000017004 1955-04-05 330 160
## 6 MX000017004 1955-04-06 320 160
## 7 MX000017004 1955-04-07 320 160
## 8 MX000017004 1955-04-08 330 160
## 9 MX000017004 1955-04-09 330 160
## 10 MX000017004 1955-04-10 330 170
## # ℹ 16,861 more rows
# That's the right format, but take a look at the data
# It's pretty sparse. We really don't need all of those lines that have two NA values
weather <- weather %>%
filter(!(is.na(TMAX) & is.na(TMIN)))
weather
## # A tibble: 11,904 × 4
## station date TMAX TMIN
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 310 150
## 2 MX000017004 1955-04-02 310 150
## 3 MX000017004 1955-04-03 310 160
## 4 MX000017004 1955-04-04 320 150
## 5 MX000017004 1955-04-05 330 160
## 6 MX000017004 1955-04-06 320 160
## 7 MX000017004 1955-04-07 320 160
## 8 MX000017004 1955-04-08 330 160
## 9 MX000017004 1955-04-09 330 160
## 10 MX000017004 1955-04-10 330 170
## # ℹ 11,894 more rows
# Let's make those column names nicer
# And just for tidyness, let's put the min before the max
weather <- weather %>%
rename(maxtemp=TMAX, mintemp=TMIN) %>%
select(station, date, mintemp, maxtemp)
head(weather, n=20)
## # A tibble: 20 × 4
## station date mintemp maxtemp
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 150 310
## 2 MX000017004 1955-04-02 150 310
## 3 MX000017004 1955-04-03 160 310
## 4 MX000017004 1955-04-04 150 320
## 5 MX000017004 1955-04-05 160 330
## 6 MX000017004 1955-04-06 160 320
## 7 MX000017004 1955-04-07 160 320
## 8 MX000017004 1955-04-08 160 330
## 9 MX000017004 1955-04-09 160 330
## 10 MX000017004 1955-04-10 170 330
## 11 MX000017004 1955-04-11 170 330
## 12 MX000017004 1955-04-12 160 320
## 13 MX000017004 1955-04-13 160 310
## 14 MX000017004 1955-04-14 160 310
## 15 MX000017004 1955-04-15 170 320
## 16 MX000017004 1955-04-16 170 320
## 17 MX000017004 1955-04-17 160 320
## 18 MX000017004 1955-04-18 160 310
## 19 MX000017004 1955-04-19 160 310
## 20 MX000017004 1955-04-20 160 320
# First, divide the temperatures by 10 to get them in degrees Celsius
weather <- weather %>%
mutate(mintemp = mintemp/10) %>%
mutate(maxtemp = maxtemp/10)
weather
## # A tibble: 11,904 × 4
## station date mintemp maxtemp
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 15 31
## 2 MX000017004 1955-04-02 15 31
## 3 MX000017004 1955-04-03 16 31
## 4 MX000017004 1955-04-04 15 32
## 5 MX000017004 1955-04-05 16 33
## 6 MX000017004 1955-04-06 16 32
## 7 MX000017004 1955-04-07 16 32
## 8 MX000017004 1955-04-08 16 33
## 9 MX000017004 1955-04-09 16 33
## 10 MX000017004 1955-04-10 17 33
## # ℹ 11,894 more rows
# Next, convert them to Fahrenheit
weather <- weather %>%
mutate(mintemp=mintemp*(9/5)+32) %>%
mutate(maxtemp=maxtemp*(9/5)+32)
weather
## # A tibble: 11,904 × 4
## station date mintemp maxtemp
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 59 87.8
## 2 MX000017004 1955-04-02 59 87.8
## 3 MX000017004 1955-04-03 60.8 87.8
## 4 MX000017004 1955-04-04 59 89.6
## 5 MX000017004 1955-04-05 60.8 91.4
## 6 MX000017004 1955-04-06 60.8 89.6
## 7 MX000017004 1955-04-07 60.8 89.6
## 8 MX000017004 1955-04-08 60.8 91.4
## 9 MX000017004 1955-04-09 60.8 91.4
## 10 MX000017004 1955-04-10 62.6 91.4
## # ℹ 11,894 more rows
# Load the data file
names <- c("DRG", "ProviderID", "Name", "Address", "City", "State", "ZIP", "Region",
"Discharges", "AverageCharges", "AverageTotalPayments", "AverageMedicarePayments")
inpatient <- read_tsv("data/inpatient.tsv", skip=1, col_names=names)
inpatient
## # A tibble: 163,065 × 12
## DRG ProviderID Name Address City State ZIP Region Discharges
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 039 - EXTRACRAN… 10001 SOUT… 1108 R… DOTH… AL 36301 AL - … 91
## 2 039 - EXTRACRAN… 10005 MARS… 2505 U… BOAZ AL 35957 AL - … 14
## 3 039 - EXTRACRAN… 10006 ELIZ… 205 MA… FLOR… AL 35631 AL - … 24
## 4 039 - EXTRACRAN… 10011 ST V… 50 MED… BIRM… AL 35235 AL - … 25
## 5 039 - EXTRACRAN… 10016 SHEL… 1000 F… ALAB… AL 35007 AL - … 18
## 6 039 - EXTRACRAN… 10023 BAPT… 2105 E… MONT… AL 36116 AL - … 67
## 7 039 - EXTRACRAN… 10029 EAST… 2000 P… OPEL… AL 36801 AL - … 51
## 8 039 - EXTRACRAN… 10033 UNIV… 619 SO… BIRM… AL 35233 AL - … 32
## 9 039 - EXTRACRAN… 10039 HUNT… 101 SI… HUNT… AL 35801 AL - … 135
## 10 039 - EXTRACRAN… 10040 GADS… 1007 G… GADS… AL 35903 AL - … 34
## # ℹ 163,055 more rows
## # ℹ 3 more variables: AverageCharges <chr>, AverageTotalPayments <chr>,
## # AverageMedicarePayments <chr>
# Review a summary of the data
str(inpatient)
## spc_tbl_ [163,065 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DRG : chr [1:163065] "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" ...
## $ ProviderID : num [1:163065] 10001 10005 10006 10011 10016 ...
## $ Name : chr [1:163065] "SOUTHEAST ALABAMA MEDICAL CENTER" "MARSHALL MEDICAL CENTER SOUTH" "ELIZA COFFEE MEMORIAL HOSPITAL" "ST VINCENT'S EAST" ...
## $ Address : chr [1:163065] "1108 ROSS CLARK CIRCLE" "2505 U S HIGHWAY 431 NORTH" "205 MARENGO STREET" "50 MEDICAL PARK EAST DRIVE" ...
## $ City : chr [1:163065] "DOTHAN" "BOAZ" "FLORENCE" "BIRMINGHAM" ...
## $ State : chr [1:163065] "AL" "AL" "AL" "AL" ...
## $ ZIP : chr [1:163065] "36301" "35957" "35631" "35235" ...
## $ Region : chr [1:163065] "AL - Dothan" "AL - Birmingham" "AL - Birmingham" "AL - Birmingham" ...
## $ Discharges : num [1:163065] 91 14 24 25 18 67 51 32 135 34 ...
## $ AverageCharges : chr [1:163065] "$32,963.07" "$15,131.85" "$37,560.37" "$13,998.28" ...
## $ AverageTotalPayments : chr [1:163065] "$5,777.24" "$5,787.57" "$5,434.95" "$5,417.56" ...
## $ AverageMedicarePayments: chr [1:163065] "$4,763.73" "$4,976.71" "$4,453.79" "$4,129.16" ...
## - attr(*, "spec")=
## .. cols(
## .. DRG = col_character(),
## .. ProviderID = col_double(),
## .. Name = col_character(),
## .. Address = col_character(),
## .. City = col_character(),
## .. State = col_character(),
## .. ZIP = col_character(),
## .. Region = col_character(),
## .. Discharges = col_double(),
## .. AverageCharges = col_character(),
## .. AverageTotalPayments = col_character(),
## .. AverageMedicarePayments = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Force the currency fields to numeric
types <- 'ciccccccinnn'
inpatient <- read_tsv("data/inpatient.tsv", skip=1, col_names=names, col_types=types)
str(inpatient)
## spc_tbl_ [163,065 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DRG : chr [1:163065] "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC" ...
## $ ProviderID : int [1:163065] 10001 10005 10006 10011 10016 10023 10029 10033 10039 10040 ...
## $ Name : chr [1:163065] "SOUTHEAST ALABAMA MEDICAL CENTER" "MARSHALL MEDICAL CENTER SOUTH" "ELIZA COFFEE MEMORIAL HOSPITAL" "ST VINCENT'S EAST" ...
## $ Address : chr [1:163065] "1108 ROSS CLARK CIRCLE" "2505 U S HIGHWAY 431 NORTH" "205 MARENGO STREET" "50 MEDICAL PARK EAST DRIVE" ...
## $ City : chr [1:163065] "DOTHAN" "BOAZ" "FLORENCE" "BIRMINGHAM" ...
## $ State : chr [1:163065] "AL" "AL" "AL" "AL" ...
## $ ZIP : chr [1:163065] "36301" "35957" "35631" "35235" ...
## $ Region : chr [1:163065] "AL - Dothan" "AL - Birmingham" "AL - Birmingham" "AL - Birmingham" ...
## $ Discharges : int [1:163065] 91 14 24 25 18 67 51 32 135 34 ...
## $ AverageCharges : num [1:163065] 32963 15132 37560 13998 31633 ...
## $ AverageTotalPayments : num [1:163065] 5777 5788 5435 5418 5658 ...
## $ AverageMedicarePayments: num [1:163065] 4764 4977 4454 4129 4851 ...
## - attr(*, "spec")=
## .. cols(
## .. DRG = col_character(),
## .. ProviderID = col_integer(),
## .. Name = col_character(),
## .. Address = col_character(),
## .. City = col_character(),
## .. State = col_character(),
## .. ZIP = col_character(),
## .. Region = col_character(),
## .. Discharges = col_integer(),
## .. AverageCharges = col_number(),
## .. AverageTotalPayments = col_number(),
## .. AverageMedicarePayments = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
# Load the data file
capitals <- read_csv("data/capitals.csv")
capitals
## # A tibble: 50 × 4
## State Capital ZIP Population
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Montgomery 36101 361748
## 2 Alaska Juneau 99801 30737
## 3 Arizona Phoenix 85001 4039182
## 4 Arkansas Little Rock 72201 652834
## 5 California Sacramento 94203 2067117
## 6 Colorado Denver 80201 2408750
## 7 Connecticut Hartford 6101 1188841
## 8 Delaware Dover 19901 147601
## 9 Florida Tallahassee 32301 336502
## 10 Georgia Atlanta 30301 5138223
## # ℹ 40 more rows
head(capitals)
## # A tibble: 6 × 4
## State Capital ZIP Population
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Montgomery 36101 361748
## 2 Alaska Juneau 99801 30737
## 3 Arizona Phoenix 85001 4039182
## 4 Arkansas Little Rock 72201 652834
## 5 California Sacramento 94203 2067117
## 6 Colorado Denver 80201 2408750
tail(capitals)
## # A tibble: 6 × 4
## State Capital ZIP Population
## <chr> <chr> <dbl> <dbl>
## 1 Vermont Montpelier 5601 7954
## 2 Virginia Richmond 23218 1194008
## 3 Washington Olympia 98501 234670
## 4 West Virginia Charleston 25301 305526
## 5 Wisconsin Madison 53701 543022
## 6 Wyoming Cheyenne 82001 85384
# Load the corrected data file
capitals <- read_csv("data/capitals_corrected.csv")
head(capitals)
## # A tibble: 6 × 4
## State Capital ZIP Population
## <chr> <chr> <chr> <dbl>
## 1 Alabama Montgomery 36101 361748
## 2 Alaska Juneau 99801 30737
## 3 Arizona Phoenix 85001 4039182
## 4 Arkansas Little Rock 72201 652834
## 5 California Sacramento 94203 2067117
## 6 Colorado Denver 80201 2408750
tail(capitals)
## # A tibble: 6 × 4
## State Capital ZIP Population
## <chr> <chr> <chr> <dbl>
## 1 Vermont Montpelier 05601 7954
## 2 Virginia Richmond 23218 1194008
## 3 Washington Olympia 98501 234670
## 4 West Virginia Charleston 25301 305526
## 5 Wisconsin Madison 53701 543022
## 6 Wyoming Cheyenne 82001 85384
# Load the data file
names <- c('InspectionID', 'RestaurantName', 'OtherName', 'LicenseNumber', 'FacilityType', 'Risk',
'Address', 'City', 'State', 'ZIP', 'InspectionDate', 'InspectionType', 'Results',
'Violations', 'Latitude', 'Longitude', 'Location')
inspections <- read_csv("data/inspections.csv", col_names=names, skip=1)
inspections
## # A tibble: 145,606 × 17
## InspectionID RestaurantName OtherName LicenseNumber FacilityType Risk
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 1995829 SUBWAY SUBWAY 1679112 Restaurant Risk…
## 2 1995817 VENEZUELAN BOWL GRIL… VENEZUEL… 2424110 Restaurant Risk…
## 3 1995822 SEVEN TEN SEVEN TEN 1172093 Restaurant Risk…
## 4 1995814 CHISME EXPRESS INC. CHISME E… 1334960 Restaurant Risk…
## 5 1995811 THE NILE RESTAURANT THE NILE… 2334190 Restaurant Risk…
## 6 1995752 WINGSTOP WINGSTOP 2517730 Restaurant Risk…
## 7 1995226 JOY AND JAMES CAFE JOY AND… 2073555 Restaurant Risk…
## 8 1995808 SUSHI MON SUSHI MON 2517725 Restaurant Risk…
## 9 1995807 JEANE KENNEY YMCA DA… JEANE KE… 2215708 Daycare Com… Risk…
## 10 1995802 PEPE'S MEXICAN FOOD PEPE'S M… 48820 Restaurant Risk…
## # ℹ 145,596 more rows
## # ℹ 11 more variables: Address <chr>, City <chr>, State <chr>, ZIP <dbl>,
## # InspectionDate <chr>, InspectionType <chr>, Results <chr>,
## # Violations <chr>, Latitude <dbl>, Longitude <dbl>, Location <chr>
# Check most inspected restaurants
inspections %>%
group_by(RestaurantName) %>%
summarize(inspections=n()) %>%
arrange(desc(inspections))
## # A tibble: 24,032 × 2
## RestaurantName inspections
## <chr> <int>
## 1 SUBWAY 2020
## 2 DUNKIN DONUTS 1023
## 3 MCDONALD'S 458
## 4 7-ELEVEN 363
## 5 MCDONALDS 260
## 6 CHIPOTLE MEXICAN GRILL 243
## 7 POTBELLY SANDWICH WORKS LLC 212
## 8 CORNER BAKERY CAFE 181
## 9 SPORTSERVICE SOLDIER FIELD 176
## 10 POTBELLY SANDWICH WORKS 166
## # ℹ 24,022 more rows
# Find alternate spellings of McDonalds
inspections %>%
filter(grepl("McDo", RestaurantName, ignore.case=TRUE)) %>%
filter(RestaurantName!='SARAH MCDONALD STEELE') %>%
select(RestaurantName) %>%
unique() %>%
head(n=20)
## # A tibble: 20 × 1
## RestaurantName
## <chr>
## 1 MCDONALD
## 2 MCDONALDS
## 3 MCDONALD'S #20104
## 4 MCDONALD'S
## 5 MCDONALDS #29307
## 6 McDONALD'S
## 7 MCDONALD'S RESTAURANTS
## 8 McDonald's Restaurant
## 9 McDonald's
## 10 McDONALDS
## 11 McDonalds
## 12 MCDONALD'S RESTAURANT
## 13 MCDONALD'S #4305
## 14 MCDONALD'S CORPORATION
## 15 MCDONALD'S #5813
## 16 McDowell
## 17 McDonald's #4308
## 18 McDONALD'S # 17277
## 19 McDONALD'S # 12785
## 20 McDONALD'S # 17274
# Create a vector of those alternate spellings
alternates <- inspections %>%
filter(grepl("McDo", RestaurantName, ignore.case=TRUE)) %>%
filter(RestaurantName!='SARAH MCDONALD STEELE') %>%
select(RestaurantName) %>%
unique() %>%
pull(RestaurantName)
alternates
## [1] "MCDONALD"
## [2] "MCDONALDS"
## [3] "MCDONALD'S #20104"
## [4] "MCDONALD'S"
## [5] "MCDONALDS #29307"
## [6] "McDONALD'S"
## [7] "MCDONALD'S RESTAURANTS"
## [8] "McDonald's Restaurant"
## [9] "McDonald's"
## [10] "McDONALDS"
## [11] "McDonalds"
## [12] "MCDONALD'S RESTAURANT"
## [13] "MCDONALD'S #4305"
## [14] "MCDONALD'S CORPORATION"
## [15] "MCDONALD'S #5813"
## [16] "McDowell"
## [17] "McDonald's #4308"
## [18] "McDONALD'S # 17277"
## [19] "McDONALD'S # 12785"
## [20] "McDONALD'S # 17274"
## [21] "McDONALDS # 17278"
## [22] "MCDONALDS RESTAURANT"
## [23] "MCDONALD'S #26364"
## [24] "McDONALD'S # 22821"
## [25] "MCDONALD'S #490"
## [26] "MCDONALD'S 26371"
## [27] "MCDONALDS #12003"
## [28] "MCDONALD'S #6491"
## [29] "MCDONALDS RESTAURANT # 14173"
## [30] "MCDONALDS #27672"
## [31] "McDONALD'S STORE #4061"
## [32] "MCDONALD'S REST. OF ILLINOIS, INC"
## [33] "MCDONALDS #5471"
## [34] "MCDONALD' S # 5618"
## [35] "McDonalds Restaurant"
## [36] "MCDONALDS #4655"
## [37] "MCDONALD'S #6337"
## [38] "MCDONALD'S RESTAURANT OF ILLINOIS,INC"
## [39] "MCDONALD'S #11290"
## [40] "MCDONALDS 812"
## [41] "McDonalds #6337"
## [42] "MCDONALD'S #6310"
## [43] "MCDONALDS #7069"
## [44] "MCDONALDS'S RESTAURANTS"
## [45] "MCDONALD'S REST. OF ILL, INC"
## [46] "MCDONALDS #7463"
## [47] "ROCK & ROLL MCDONALDS"
## [48] "MCDONALD'S #5246"
## [49] "ERNEST CORP / MCDONALD'S"
## [50] "MCDONALDS #490"
## [51] "MCDONALD' S RESTAURANT"
## [52] "mcdonalds"
# Replace them all with MCDONALDS
inspections <- inspections %>%
mutate(RestaurantName=ifelse(RestaurantName %in% alternates, 'MCDONALDS', RestaurantName))
inspections
## # A tibble: 145,606 × 17
## InspectionID RestaurantName OtherName LicenseNumber FacilityType Risk
## <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 1995829 SUBWAY SUBWAY 1679112 Restaurant Risk…
## 2 1995817 VENEZUELAN BOWL GRIL… VENEZUEL… 2424110 Restaurant Risk…
## 3 1995822 SEVEN TEN SEVEN TEN 1172093 Restaurant Risk…
## 4 1995814 CHISME EXPRESS INC. CHISME E… 1334960 Restaurant Risk…
## 5 1995811 THE NILE RESTAURANT THE NILE… 2334190 Restaurant Risk…
## 6 1995752 WINGSTOP WINGSTOP 2517730 Restaurant Risk…
## 7 1995226 JOY AND JAMES CAFE JOY AND… 2073555 Restaurant Risk…
## 8 1995808 SUSHI MON SUSHI MON 2517725 Restaurant Risk…
## 9 1995807 JEANE KENNEY YMCA DA… JEANE KE… 2215708 Daycare Com… Risk…
## 10 1995802 PEPE'S MEXICAN FOOD PEPE'S M… 48820 Restaurant Risk…
## # ℹ 145,596 more rows
## # ℹ 11 more variables: Address <chr>, City <chr>, State <chr>, ZIP <dbl>,
## # InspectionDate <chr>, InspectionType <chr>, Results <chr>,
## # Violations <chr>, Latitude <dbl>, Longitude <dbl>, Location <chr>
# Check most inspected restaurants again
inspections %>%
group_by(RestaurantName) %>%
summarize(inspections=n()) %>%
arrange(desc(inspections))
## # A tibble: 23,981 × 2
## RestaurantName inspections
## <chr> <int>
## 1 SUBWAY 2020
## 2 MCDONALDS 1457
## 3 DUNKIN DONUTS 1023
## 4 7-ELEVEN 363
## 5 CHIPOTLE MEXICAN GRILL 243
## 6 POTBELLY SANDWICH WORKS LLC 212
## 7 CORNER BAKERY CAFE 181
## 8 SPORTSERVICE SOLDIER FIELD 176
## 9 POTBELLY SANDWICH WORKS 166
## 10 DOMINO'S PIZZA 162
## # ℹ 23,971 more rows
# Load the data file
whitehouse <- read_csv("data/whitehouse.csv", col_types="ccncci")
whitehouse
## # A tibble: 2,784 × 6
## Name Status Salary `Pay Basis` `Position Title` Year
## <chr> <chr> <dbl> <chr> <chr> <int>
## 1 Abrams, Adam W. Employee 70000 Per Annum REGIONAL COMMUNICATION… 2011
## 2 Abrevaya, Sandra Employee 90000 Per Annum ASSOCIATE COMMUNICATIO… 2011
## 3 Agnew, David P. Employee 93840 Per Annum DEPUTY DIRECTOR OF INT… 2011
## 4 Albino, James N. Employee 93000 Per Annum SENIOR PROGRAM MANAGER 2011
## 5 Alley, Hilary J. Employee 45000 Per Annum ASSOCIATE DIRECTOR 2011
## 6 Anderson, Amanda D. Employee 80000 Per Annum SENIOR LEGISLATIVE AFF… 2011
## 7 Anderson, Brooke D. Employee 147500 Per Annum DEPUTY ASSISTANT TO TH… 2011
## 8 Apsel, Sarah Detailee 108717 Per Annum POLICY ASSISTANT 2011
## 9 Arguelles, Adam J. Employee 102000 Per Annum SPECIAL ASSISTANT TO T… 2011
## 10 Asen, Jonathan D. Employee 45000 Per Annum LEGISLATIVE ASSISTANT … 2011
## # ℹ 2,774 more rows
# Look at a boxplot of salary data
boxplot(whitehouse$Salary)
# Find salaries over $1,000,000
whitehouse %>%
filter(Salary>1000000)
## # A tibble: 2 × 6
## Name Status Salary `Pay Basis` `Position Title` Year
## <chr> <chr> <dbl> <chr> <chr> <int>
## 1 Case, Michael A. Employee 5000000 Per Annum SENIOR WRITER 2011
## 2 Blair, Patricia A. Employee 9866900 Per Annum CHIEF CALLIGRAPHER 2015
# Set salaries over $1,000,000 to NA
whitehouse <- whitehouse %>%
mutate(Salary=ifelse(Salary>1000000, NA, Salary))
# Rerun the box plot
boxplot(whitehouse$Salary)
# Find the salary over $200,000
whitehouse %>%
filter(Salary>200000)
## # A tibble: 1 × 6
## Name Status Salary `Pay Basis` `Position Title` Year
## <chr> <chr> <dbl> <chr> <chr> <int>
## 1 Wheeler, Seth F. Detailee 225000 Per Annum SENIOR POLICY ADVISOR 2013
# Load the data file
tests <- read_csv("data/testscores.csv")
tests
## # A tibble: 217 × 4
## studentID age grade testScore
## <dbl> <dbl> <dbl> <dbl>
## 1 10001 9 4 89
## 2 10002 8 2 88
## 3 10003 8 3 81
## 4 10004 10 5 72
## 5 10005 10 5 72
## 6 10006 10 5 74
## 7 10007 8 2 85
## 8 10008 8 2 65
## 9 10009 8 2 83
## 10 10010 12 7 85
## # ℹ 207 more rows
# Look at summary statistics
summary(tests)
## studentID age grade testScore
## Min. :10001 Min. : 5.000 Min. :1.0 Min. : 59.00
## 1st Qu.:10055 1st Qu.: 8.000 1st Qu.:3.0 1st Qu.: 75.00
## Median :10109 Median :10.000 Median :4.0 Median : 83.00
## Mean :10109 Mean : 9.871 Mean :4.3 Mean : 82.43
## 3rd Qu.:10163 3rd Qu.:11.000 3rd Qu.:6.0 3rd Qu.: 89.00
## Max. :10217 Max. :39.000 Max. :8.0 Max. :114.00
boxplot(tests$age)
# Investigate outliers
tests %>%
filter(age>15)
## # A tibble: 2 × 4
## studentID age grade testScore
## <dbl> <dbl> <dbl> <dbl>
## 1 10115 39 2 91
## 2 10116 26 7 70
# Remove those two cases and reexamine boxplot
tests <- tests %>%
mutate(age=ifelse(studentID==10115, 7, age)) %>%
mutate(age=ifelse(studentID==10116, 12, age))
boxplot(tests$age)
# But look at it by grade level
boxplot(tests$age~tests$grade)
## 03-Detecting Illogical Values
# Load the data file
residents <- read_csv("data/residents.csv", col_types='iillll')
residents
## # A tibble: 2,063 × 6
## personID age employed ownsHome rentsHome ownsCar
## <int> <int> <lgl> <lgl> <lgl> <lgl>
## 1 1 50 TRUE TRUE FALSE TRUE
## 2 2 64 TRUE FALSE TRUE TRUE
## 3 3 39 FALSE FALSE TRUE TRUE
## 4 4 39 FALSE TRUE FALSE FALSE
## 5 5 51 TRUE TRUE FALSE TRUE
## 6 6 39 TRUE TRUE FALSE TRUE
## 7 7 45 TRUE TRUE FALSE TRUE
## 8 8 64 TRUE TRUE FALSE TRUE
## 9 9 42 TRUE FALSE TRUE FALSE
## 10 10 36 FALSE FALSE TRUE TRUE
## # ℹ 2,053 more rows
# Look at a summary of the data
summary(residents)
## personID age employed ownsHome
## Min. : 1.0 Min. :18.00 Mode :logical Mode :logical
## 1st Qu.: 516.5 1st Qu.:31.00 FALSE:424 FALSE:837
## Median :1032.0 Median :42.00 TRUE :1639 TRUE :1226
## Mean :1032.0 Mean :42.22
## 3rd Qu.:1547.5 3rd Qu.:54.00
## Max. :2063.0 Max. :65.00
## rentsHome ownsCar
## Mode :logical Mode :logical
## FALSE:1224 FALSE:635
## TRUE :839 TRUE :1428
##
##
##
# Find unusual cases
residents %>%
filter(ownsHome==rentsHome)
## # A tibble: 6 × 6
## personID age employed ownsHome rentsHome ownsCar
## <int> <int> <lgl> <lgl> <lgl> <lgl>
## 1 17 55 TRUE FALSE FALSE TRUE
## 2 134 19 TRUE TRUE TRUE FALSE
## 3 203 43 FALSE TRUE TRUE TRUE
## 4 382 60 FALSE TRUE TRUE TRUE
## 5 1269 56 TRUE TRUE TRUE TRUE
## 6 1902 31 TRUE FALSE FALSE FALSE
# Load the data file
pew <- read_csv("data/pew.csv")
# Examine the data
pew
## # A tibble: 18 × 11
## religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k` `$50-75k` `$75-100k`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agnostic 27 34 60 81 76 137 122
## 2 Atheist 12 27 37 52 35 70 73
## 3 Buddhist 27 21 30 34 33 58 62
## 4 Catholic 418 617 732 670 638 1116 949
## 5 Don't k… 15 14 15 11 10 35 21
## 6 Evangel… 575 869 1064 982 881 1486 949
## 7 Hindu 1 9 7 9 11 34 47
## 8 Histori… 228 244 236 238 197 223 131
## 9 Jehovah… 20 27 24 24 21 30 15
## 10 Jewish 19 19 25 25 30 95 69
## 11 Mainlin… 289 495 619 655 651 1107 939
## 12 Mormon 29 40 48 51 56 112 85
## 13 Muslim 6 7 9 10 9 23 16
## 14 Orthodox 13 17 23 32 32 47 38
## 15 Other C… 9 7 11 13 13 14 18
## 16 Other F… 20 33 40 49 49 63 46
## 17 Other W… 5 2 3 2 2 7 3
## 18 Unaffil… 217 299 374 341 341 528 407
## # ℹ 3 more variables: `$100-150k` <dbl>, `>$150k` <dbl>,
## # `Don't know/refused` <dbl>
# Make the wide dataset long
pew.long <- gather(pew, income, freq, -religion)
# Examine the new dataset
pew.long
## # A tibble: 180 × 3
## religion income freq
## <chr> <chr> <dbl>
## 1 Agnostic <$10k 27
## 2 Atheist <$10k 12
## 3 Buddhist <$10k 27
## 4 Catholic <$10k 418
## 5 Don't know/refused <$10k 15
## 6 Evangelical Prot <$10k 575
## 7 Hindu <$10k 1
## 8 Historically Black Prot <$10k 228
## 9 Jehovah's Witness <$10k 20
## 10 Jewish <$10k 19
## # ℹ 170 more rows
# Load the data file
weather <- read_csv("data/mexicanweather.csv")
# Examine the data
weather
## # A tibble: 33,712 × 4
## station element value date
## <chr> <chr> <dbl> <date>
## 1 MX000017004 TMAX 310 1955-04-01
## 2 MX000017004 TMIN 150 1955-04-01
## 3 MX000017004 TMAX 310 1955-05-01
## 4 MX000017004 TMIN 200 1955-05-01
## 5 MX000017004 TMAX 300 1955-06-01
## 6 MX000017004 TMIN 160 1955-06-01
## 7 MX000017004 TMAX 270 1955-07-01
## 8 MX000017004 TMIN 150 1955-07-01
## 9 MX000017004 TMAX 230 1955-08-01
## 10 MX000017004 TMIN 140 1955-08-01
## # ℹ 33,702 more rows
# Make the long dataset wider
weather.wide <- spread(weather, element, value)
# Examine the data
weather.wide
## # A tibble: 16,871 × 4
## station date TMAX TMIN
## <chr> <date> <dbl> <dbl>
## 1 MX000017004 1955-04-01 310 150
## 2 MX000017004 1955-04-02 310 150
## 3 MX000017004 1955-04-03 310 160
## 4 MX000017004 1955-04-04 320 150
## 5 MX000017004 1955-04-05 330 160
## 6 MX000017004 1955-04-06 320 160
## 7 MX000017004 1955-04-07 320 160
## 8 MX000017004 1955-04-08 330 160
## 9 MX000017004 1955-04-09 330 160
## 10 MX000017004 1955-04-10 330 170
## # ℹ 16,861 more rows