### Tibble from a data set
tibb <- as_tibble(CO2)
tibb
## # A tibble: 84 x 5
## Plant Type Treatment conc uptake
## <ord> <fct> <fct> <dbl> <dbl>
## 1 Qn1 Quebec nonchilled 95 16
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
## 7 Qn1 Quebec nonchilled 1000 39.7
## 8 Qn2 Quebec nonchilled 95 13.6
## 9 Qn2 Quebec nonchilled 175 27.3
## 10 Qn2 Quebec nonchilled 250 37.1
## # ... with 74 more rows
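# A tibble prints only its first ten rows by default (hence the "74 more rows" note);
# print() takes an n argument if we want to see more
print(tibb, n = 15)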
### Tibble from scratch
name <- c("Karol", "Matt", "Adam", "Santi")
birthyear <- c(2000, 2001,2002, 2003)
eyes <- c("blue", "black", "green", "brown")
people <- tibble(name, birthyear, eyes)
people
## # A tibble: 4 x 3
## name birthyear eyes
## <chr> <dbl> <chr>
## 1 Karol 2000 blue
## 2 Matt 2001 black
## 3 Adam 2002 green
## 4 Santi 2003 brown
The $ operator extracts a single variable as a vector
people$eyes
## [1] "blue" "black" "green" "brown"
unique(people$eyes)
## [1] "blue" "black" "green" "brown"
## The [[ operator does the same, by column name or by position
people[["eyes"]]
## [1] "blue" "black" "green" "brown"
people[[3]]
## [1] "blue" "black" "green" "brown"
## CSV files
inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv")
## For more info, see ?read.csv (or ?read_csv for the readr equivalent)
names <- c("ID", "DBAName", "AkaName", "License", "Facilitytype", "Risk", "Address", "City", "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations", "Latitude", "Longitude", "Location")
inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv",
col.names = names)
glimpse(inspections)
## Rows: 145,606
## Columns: 17
## $ ID <int> 1995829, 1995817, 1995822, 1995814, 1995811, 1995752, 1~
## $ DBAName <chr> "SUBWAY", "VENEZUELAN BOWL GRILL LLC", "SEVEN TEN", "CH~
## $ AkaName <chr> "SUBWAY", "VENEZUELAN BOWL GRILL", "SEVEN TEN", "CHISME~
## $ License <int> 1679112, 2424110, 1172093, 1334960, 2334190, 2517730, 2~
## $ Facilitytype <chr> "Restaurant", "Restaurant", "Restaurant", "Restaurant",~
## $ Risk <chr> "Risk 1 (High)", "Risk 1 (High)", "Risk 1 (High)", "Ris~
## $ Address <chr> "8711 S ASHLAND AVE ", "2436 N LINCOLN AVE ", "1055 E 5~
## $ City <chr> "CHICAGO", "CHICAGO", "CHICAGO", "CHICAGO", "CHICAGO", ~
## $ State <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "~
## $ Zip <int> 60620, 60614, 60615, 60629, 60615, 60621, 60630, 60614,~
## $ InspectionDate <chr> "03/17/2017", "03/17/2017", "03/17/2017", "03/17/2017",~
## $ InspectionType <chr> "Canvass Re-Inspection", "Canvass Re-Inspection", "Canv~
## $ Results <chr> "Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Out of~
## $ Violations <chr> "19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE AREA; CLE~
## $ Latitude <dbl> 41.73541, 41.92619, 41.79492, 41.78429, 41.79517, 41.77~
## $ Longitude <dbl> -87.66291, -87.65002, -87.59985, -87.72277, -87.59710, ~
## $ Location <chr> "(41.735408702583165, -87.66290968424336)", "(41.926190~
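# readr's read_csv() is the tidyverse counterpart and returns a tibble directly.
# A sketch using the same path and the names vector defined above; because col_names
# supplies our own header, skip = 1 drops the header row that is already in the file.
inspections_tbl <- read_csv("C:/Users/karolo/Desktop/data/inspections.csv",
                            col_names = names, skip = 1)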
inpatient <- read_tsv("C:/Users/karolo/Desktop/data/inpatient.tsv")
## Rows: 163065 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (10): DRG Definition, Provider Name, Provider Street Address, Provider C...
## dbl (2): Provider Id, Total Discharges
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
names_i <- c("DRG", "ProviderID", "Name", "Address", "City", "State", "Zip", "Region", "Discharges", "AverageCharges", "AverageTotalPayments", "AverageMedicarePayments")
inpatient <- read.delim("C:/Users/karolo/Desktop/data/inpatient.tsv",
col.names = names_i)
inpatient$Discharges <- as.integer(inpatient$Discharges)
inpatient$ProviderID <- as.character(inpatient$ProviderID)
inpatient$Zip <- as.character(inpatient$Zip)
glimpse(inpatient)
## Rows: 163,065
## Columns: 12
## $ DRG <chr> "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC", "0~
## $ ProviderID <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~
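# The three Average* columns came in as character because of the dollar signs and
# thousands separators. One way to make them numeric is readr::parse_number(), which
# strips currency symbols and grouping marks. This is a side sketch (inpatient_costs
# is a new name); the rest of these notes keep working with the character columns.
inpatient_costs <- inpatient %>%
  mutate(AverageCharges = parse_number(AverageCharges),
         AverageTotalPayments = parse_number(AverageTotalPayments),
         AverageMedicarePayments = parse_number(AverageMedicarePayments))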
stoppages <- read_delim("C:/Users/karolo/Desktop/data/workstoppages.txt", delim = "^")
## Rows: 434 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: "^"
## chr (1): period
## dbl (2): year, workers
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(stoppages)
## Rows: 434
## Columns: 3
## $ year <dbl> 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 19~
## $ period <chr> "January", "February", "March", "April", "May", "June", "July"~
## $ workers <dbl> 29600, 20900, 207800, 223500, 259000, 415100, 125400, 86600, 6~
## The readxl package isn't part of the core tidyverse, so we need to load it separately
library(readxl)
#### Try just reading the file without other arguments
breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx")
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...
glimpse(breakfast)
## Rows: 46
## Columns: 7
## $ `School Breakfast Program Participation and Meals Served` <chr> "(Data as of~
## $ ...2 <chr> NA, " -~
## $ ...3 <chr> NA, NA, "Red~
## $ ...4 <chr> NA, NA, "Pai~
## $ ...5 <chr> NA, NA, "Tot~
## $ ...6 <chr> NA, NA, "Mea~
## $ ...7 <chr> NA, NA, "Fre~
## Try skipping three lines
breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx", skip=3)
glimpse(breakfast)
## Rows: 43
## Columns: 7
## $ `Fiscal Years ` <chr> " ", "1975", "1976", "1977", "1978", "19~
## $ Free <chr> "------------------------------Millions----~
## $ `Red. Price ` <dbl> NA, 0.04, 0.06, 0.11, 0.16, 0.21, 0.25, 0.2~
## $ Paid <dbl> NA, 0.33, 0.37, 0.36, 0.42, 0.54, 0.56, 0.5~
## $ `Total ` <dbl> NA, 1.82, 2.20, 2.49, 2.80, 3.32, 3.60, 3.8~
## $ `Meals Served` <dbl> NA, 294.70, 353.60, 434.30, 478.80, 565.60,~
## $ `Free/ RP of Total Meals ` <chr> "--Percent--", "82.1", "84.2", "85.7", "85.~
# It looks like we need to skip five lines, which will remove the column names
# So let's create a vector with column names
names <- c("Year", "FreeStudents", "ReducedStudents", "PaidStudents", "TotalStudents",
"MealsServed", "PercentFree")
# And then try reading the file again
breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx",skip=5, col_names = names)
glimpse(breakfast)
## Rows: 42
## Columns: 7
## $ Year <dbl> 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, ~
## $ FreeStudents <dbl> 1.45, 1.76, 2.02, 2.23, 2.56, 2.79, 3.05, 2.80, 2.87, ~
## $ ReducedStudents <dbl> 0.04, 0.06, 0.11, 0.16, 0.21, 0.25, 0.25, 0.16, 0.15, ~
## $ PaidStudents <dbl> 0.33, 0.37, 0.36, 0.42, 0.54, 0.56, 0.51, 0.36, 0.34, ~
## $ TotalStudents <dbl> 1.82, 2.20, 2.49, 2.80, 3.32, 3.60, 3.81, 3.32, 3.36, ~
## $ MealsServed <dbl> 294.70, 353.60, 434.30, 478.80, 565.60, 619.90, 644.20~
## $ PercentFree <dbl> 82.1, 84.2, 85.7, 85.3, 84.1, 85.2, 86.9, 89.3, 90.3, ~
# Let's do some quick manipulation of this tibble.
# First, convert the student and meal counts (reported in millions) to actual numbers,
# and the percentage to a proportion
breakfast <- breakfast %>%
  mutate(FreeStudents = FreeStudents * 1000000,
         ReducedStudents = ReducedStudents * 1000000,
         PaidStudents = PaidStudents * 1000000,
         TotalStudents = TotalStudents * 1000000,
         MealsServed = MealsServed * 1000000,
         PercentFree = PercentFree / 100)
glimpse(breakfast)
## Rows: 42
## Columns: 7
## $ Year <dbl> 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, ~
## $ FreeStudents <dbl> 1450000, 1760000, 2020000, 2230000, 2560000, 2790000, ~
## $ ReducedStudents <dbl> 40000, 60000, 110000, 160000, 210000, 250000, 250000, ~
## $ PaidStudents <dbl> 330000, 370000, 360000, 420000, 540000, 560000, 510000~
## $ TotalStudents <dbl> 1820000, 2200000, 2490000, 2800000, 3320000, 3600000, ~
## $ MealsServed <dbl> 294700000, 353600000, 434300000, 478800000, 565600000,~
## $ PercentFree <dbl> 0.821, 0.842, 0.857, 0.853, 0.841, 0.852, 0.869, 0.893~
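# The same conversion can be written more compactly with dplyr::across() (dplyr 1.0+).
# This is a drop-in replacement for the mutate() call above, not an extra step to run
# on top of it (that would scale the columns twice):
breakfast <- breakfast %>%
  mutate(across(FreeStudents:MealsServed, ~ .x * 1000000),
         PercentFree = PercentFree / 100)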
pew <- read.csv("C:/Users/karolo/Desktop/data/pew.csv")
pew_long <- pivot_longer(pew, !religion, names_to = "Income", values_to = "freq")
head(pew_long)
## # A tibble: 6 x 3
## religion Income freq
## <chr> <chr> <int>
## 1 Agnostic X..10k 27
## 2 Agnostic X.10.20k 34
## 3 Agnostic X.20.30k 60
## 4 Agnostic X.30.40k 81
## 5 Agnostic X.40.50k 76
## 6 Agnostic X.50.75k 137
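# The odd Income values (X..10k, X.10.20k, ...) are read.csv() mangling column names
# that presumably contain characters like "<", "$", and "-". Passing
# check.names = FALSE (or using readr::read_csv()) keeps the original headers.
# A sketch with new object names so the output above still applies:
pew_raw <- read.csv("C:/Users/karolo/Desktop/data/pew.csv", check.names = FALSE)
pew_long_raw <- pivot_longer(pew_raw, !religion, names_to = "Income", values_to = "freq")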
weather <- read.csv("C:/Users/karolo/Desktop/data/mexicanweather.csv")
head(weather)
## station element value date
## 1 MX000017004 TMAX 310 1955-04-01
## 2 MX000017004 TMIN 150 1955-04-01
## 3 MX000017004 TMAX 310 1955-05-01
## 4 MX000017004 TMIN 200 1955-05-01
## 5 MX000017004 TMAX 300 1955-06-01
## 6 MX000017004 TMIN 160 1955-06-01
weatherwide <- pivot_wider(weather, names_from = element, values_from = value)
head(weatherwide)
## # A tibble: 6 x 4
## station date TMAX TMIN
## <chr> <chr> <int> <int>
## 1 MX000017004 1955-04-01 310 150
## 2 MX000017004 1955-05-01 310 200
## 3 MX000017004 1955-06-01 300 160
## 4 MX000017004 1955-07-01 270 150
## 5 MX000017004 1955-08-01 230 140
## 6 MX000017004 1955-09-01 230 150
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## year() extracts the year from a date
## month() extracts the month value
## day() extracts the day of the month
## wday() returns the day of the week
## yday() returns the day of the year
## hour() extracts the hour from a time
## minute() extracts the minute value
## second() extracts the second value
# Date Construction Functions
## mdy() parses dates like "April 1, 2018"
## ymd() parses dates like "2018-04-01"
## dmy() parses dates like "01-04-2018"
## mdy_hms() parses date-times like "April 1, 2018 04:32:16"
## ymd_hms() and dmy_hms() follow the same pattern for other orderings
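# A few quick illustrations of these helpers, using the example dates above
# (the comments show what each call should return):
dmy("01-04-2018")                       # "2018-04-01"
mdy_hms("April 1, 2018 04:32:16")       # "2018-04-01 04:32:16 UTC"
hour(mdy_hms("April 1, 2018 04:32:16")) # 4
wday("2018-04-01", label = TRUE)        # Sun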
weather_time <- weather %>%
  mutate(year = year(date), month = month(date), day = day(date))
head(weather_time)
## station element value date year month day
## 1 MX000017004 TMAX 310 1955-04-01 1955 4 1
## 2 MX000017004 TMIN 150 1955-04-01 1955 4 1
## 3 MX000017004 TMAX 310 1955-05-01 1955 5 1
## 4 MX000017004 TMIN 200 1955-05-01 1955 5 1
## 5 MX000017004 TMAX 300 1955-06-01 1955 6 1
## 6 MX000017004 TMIN 160 1955-06-01 1955 6 1
wday("2018-04-01")
## [1] 1
mdy("04/01/22")
## [1] "2022-04-01"
names <- c("ID", "DBAName", "AkaName", "License", "Facilitytype", "Risk", "Address", "City", "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations", "Latitude", "Longitude", "Location")
inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv",
col.names = names)
# Create a new column called Region that combines City and State
regional_inspections <- unite(inspections, Region, City, State, sep = ", ", remove = FALSE)
# And take a look at the unique regions
unique(regional_inspections$Region)
## [1] "CHICAGO, IL" ", IL"
## [3] "ELK GROVE VILLAGE, IL" "SCHAUMBURG, IL"
## [5] "Chicago, IL" "CALUMET CITY, IL"
## [7] "SKOKIE, IL" "chicago, IL"
## [9] "EVANSTON, IL" "CCHICAGO, IL"
## [11] "NILES NILES, IL" "CHicago, IL"
## [13] "ELMHURST, IL" ", "
## [15] "SCHILLER PARK, IL" "OAK PARK, IL"
## [17] "BRIDEVIEW, IL" "CHESTNUT STREET, IL"
## [19] "BEDFORD PARK, IL" "BANNOCKBURNDEERFIELD, IL"
## [21] "CHCICAGO, IL" "BLOOMINGDALE, IL"
## [23] "CICERO, IL" "Norridge, IL"
## [25] "CHICAGO, " "CHARLES A HAYES, IL"
## [27] "CHICAGOCHICAGO, IL" "MAYWOOD, IL"
## [29] "CHCHICAGO, IL" "CHICAGOI, IL"
## [31] "SUMMIT, IL" "OOLYMPIA FIELDS, IL"
## [33] "WESTMONT, IL" "CHICAGO HEIGHTS, IL"
## [35] "JUSTICE, IL" "TINLEY PARK, IL"
## [37] "LOMBARD, IL" "EAST HAZEL CREST, IL"
## [39] "COUNTRY CLUB HILLS, IL" "STREAMWOOD, IL"
## [41] "BOLINGBROOK, IL" "INACTIVE, IL"
## [43] "BERWYN, IL" "BURNHAM, IL"
## [45] "DES PLAINES, IL" "LAKE ZURICH, IL"
## [47] "OLYMPIA FIELDS, IL" "alsip, IL"
## [49] "OAK LAWN, IL" "BLUE ISLAND, IL"
## [51] "GLENCOE, IL" "FRANKFORT, IL"
## [53] "NAPERVILLE, IL" "BROADVIEW, IL"
## [55] "WORTH, IL" "Maywood, IL"
## [57] "ALSIP, IL" "EVERGREEN PARK, IL"
# stringr loads with the core tidyverse in recent versions, but we attach it explicitly here
library(stringr)
# Let's handle the uppercase/lowercase issues by converting everything to uppercase
regional_inspections <- regional_inspections %>%
mutate(Region=str_to_upper(Region))
# What were the results of that?
unique(regional_inspections$Region)
## [1] "CHICAGO, IL" ", IL"
## [3] "ELK GROVE VILLAGE, IL" "SCHAUMBURG, IL"
## [5] "CALUMET CITY, IL" "SKOKIE, IL"
## [7] "EVANSTON, IL" "CCHICAGO, IL"
## [9] "NILES NILES, IL" "ELMHURST, IL"
## [11] ", " "SCHILLER PARK, IL"
## [13] "OAK PARK, IL" "BRIDEVIEW, IL"
## [15] "CHESTNUT STREET, IL" "BEDFORD PARK, IL"
## [17] "BANNOCKBURNDEERFIELD, IL" "CHCICAGO, IL"
## [19] "BLOOMINGDALE, IL" "CICERO, IL"
## [21] "NORRIDGE, IL" "CHICAGO, "
## [23] "CHARLES A HAYES, IL" "CHICAGOCHICAGO, IL"
## [25] "MAYWOOD, IL" "CHCHICAGO, IL"
## [27] "CHICAGOI, IL" "SUMMIT, IL"
## [29] "OOLYMPIA FIELDS, IL" "WESTMONT, IL"
## [31] "CHICAGO HEIGHTS, IL" "JUSTICE, IL"
## [33] "TINLEY PARK, IL" "LOMBARD, IL"
## [35] "EAST HAZEL CREST, IL" "COUNTRY CLUB HILLS, IL"
## [37] "STREAMWOOD, IL" "BOLINGBROOK, IL"
## [39] "INACTIVE, IL" "BERWYN, IL"
## [41] "BURNHAM, IL" "DES PLAINES, IL"
## [43] "LAKE ZURICH, IL" "OLYMPIA FIELDS, IL"
## [45] "ALSIP, IL" "OAK LAWN, IL"
## [47] "BLUE ISLAND, IL" "GLENCOE, IL"
## [49] "FRANKFORT, IL" "NAPERVILLE, IL"
## [51] "BROADVIEW, IL" "WORTH, IL"
## [53] "EVERGREEN PARK, IL"
# Let's take care of a few misspellings of Chicago
badchicagos <- c('CCHICAGO, IL', 'CHCICAGO, IL', 'CHICAGOCHICAGO, IL', 'CHCHICAGO, IL', 'CHICAGOI, IL')
regional_inspections <- regional_inspections %>%
mutate(Region=ifelse(Region %in% badchicagos, 'CHICAGO, IL', Region))
# And see what's left
unique(regional_inspections$Region)
## [1] "CHICAGO, IL" ", IL"
## [3] "ELK GROVE VILLAGE, IL" "SCHAUMBURG, IL"
## [5] "CALUMET CITY, IL" "SKOKIE, IL"
## [7] "EVANSTON, IL" "NILES NILES, IL"
## [9] "ELMHURST, IL" ", "
## [11] "SCHILLER PARK, IL" "OAK PARK, IL"
## [13] "BRIDEVIEW, IL" "CHESTNUT STREET, IL"
## [15] "BEDFORD PARK, IL" "BANNOCKBURNDEERFIELD, IL"
## [17] "BLOOMINGDALE, IL" "CICERO, IL"
## [19] "NORRIDGE, IL" "CHICAGO, "
## [21] "CHARLES A HAYES, IL" "MAYWOOD, IL"
## [23] "SUMMIT, IL" "OOLYMPIA FIELDS, IL"
## [25] "WESTMONT, IL" "CHICAGO HEIGHTS, IL"
## [27] "JUSTICE, IL" "TINLEY PARK, IL"
## [29] "LOMBARD, IL" "EAST HAZEL CREST, IL"
## [31] "COUNTRY CLUB HILLS, IL" "STREAMWOOD, IL"
## [33] "BOLINGBROOK, IL" "INACTIVE, IL"
## [35] "BERWYN, IL" "BURNHAM, IL"
## [37] "DES PLAINES, IL" "LAKE ZURICH, IL"
## [39] "OLYMPIA FIELDS, IL" "ALSIP, IL"
## [41] "OAK LAWN, IL" "BLUE ISLAND, IL"
## [43] "GLENCOE, IL" "FRANKFORT, IL"
## [45] "NAPERVILLE, IL" "BROADVIEW, IL"
## [47] "WORTH, IL" "EVERGREEN PARK, IL"
# There are some "CHICAGO, NA" values that we can clearly correct to "CHICAGO, IL"
regional_inspections <- regional_inspections %>%
mutate(Region=ifelse(Region=='CHICAGO, NA', 'CHICAGO, IL', Region))
# But we don't know what to do with "NA, IL", "NA, NA", or "INACTIVE, IL"
# so let's set those to missing values
nachicagos <- c('NA, IL', 'NA, NA', 'INACTIVE, IL')
regional_inspections <- regional_inspections %>%
mutate(Region=ifelse(Region %in% nachicagos, NA, Region))
# How did we do?
unique(regional_inspections$Region)
## [1] "CHICAGO, IL" ", IL"
## [3] "ELK GROVE VILLAGE, IL" "SCHAUMBURG, IL"
## [5] "CALUMET CITY, IL" "SKOKIE, IL"
## [7] "EVANSTON, IL" "NILES NILES, IL"
## [9] "ELMHURST, IL" ", "
## [11] "SCHILLER PARK, IL" "OAK PARK, IL"
## [13] "BRIDEVIEW, IL" "CHESTNUT STREET, IL"
## [15] "BEDFORD PARK, IL" "BANNOCKBURNDEERFIELD, IL"
## [17] "BLOOMINGDALE, IL" "CICERO, IL"
## [19] "NORRIDGE, IL" "CHICAGO, "
## [21] "CHARLES A HAYES, IL" "MAYWOOD, IL"
## [23] "SUMMIT, IL" "OOLYMPIA FIELDS, IL"
## [25] "WESTMONT, IL" "CHICAGO HEIGHTS, IL"
## [27] "JUSTICE, IL" "TINLEY PARK, IL"
## [29] "LOMBARD, IL" "EAST HAZEL CREST, IL"
## [31] "COUNTRY CLUB HILLS, IL" "STREAMWOOD, IL"
## [33] "BOLINGBROOK, IL" NA
## [35] "BERWYN, IL" "BURNHAM, IL"
## [37] "DES PLAINES, IL" "LAKE ZURICH, IL"
## [39] "OLYMPIA FIELDS, IL" "ALSIP, IL"
## [41] "OAK LAWN, IL" "BLUE ISLAND, IL"
## [43] "GLENCOE, IL" "FRANKFORT, IL"
## [45] "NAPERVILLE, IL" "BROADVIEW, IL"
## [47] "WORTH, IL" "EVERGREEN PARK, IL"
# Separate at the fourth position
inpatient_separate <- separate(inpatient, DRG, c('DRGcode', 'DRGdescription'), 4)
# And take a look at the data now
glimpse(inpatient_separate)
## Rows: 163,065
## Columns: 13
## $ DRGcode <chr> "039 ", "039 ", "039 ", "039 ", "039 ", "039 "~
## $ DRGdescription <chr> "- EXTRACRANIAL PROCEDURES W/O CC/MCC", "- EXT~
## $ ProviderID <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~
# Trim the DRGcode field
inpatient_separate <- inpatient_separate %>%
mutate(DRGcode=str_trim(DRGcode))
glimpse(inpatient_separate)
## Rows: 163,065
## Columns: 13
## $ DRGcode <chr> "039", "039", "039", "039", "039", "039", "039~
## $ DRGdescription <chr> "- EXTRACRANIAL PROCEDURES W/O CC/MCC", "- EXT~
## $ ProviderID <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~
# The DRGdescription field still starts with "- ", so trimming isn't enough; drop the first two characters with str_sub()
inpatient_separate <- inpatient_separate %>%
mutate(DRGdescription=str_sub(DRGdescription, 3))
glimpse(inpatient_separate)
## Rows: 163,065
## Columns: 13
## $ DRGcode <chr> "039", "039", "039", "039", "039", "039", "039~
## $ DRGdescription <chr> "EXTRACRANIAL PROCEDURES W/O CC/MCC", "EXTRACR~
## $ ProviderID <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~
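# An alternative to the position-based split plus the two string fixes above: separate()
# also accepts a separator pattern, so splitting on " - " gives a trimmed code and a
# clean description in one step. This assumes every DRG value uses " - " between code
# and description; extra = "merge" keeps any later " - " inside the description.
inpatient_alt <- separate(inpatient, DRG, c("DRGcode", "DRGdescription"),
                          sep = " - ", extra = "merge")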
library(ggplot2)
coal <- read.csv("C:/Users/karolo/Desktop/data/coal.csv")
head(coal, n=2)
## Table..Total.Coal.Consumption..Quadrillion.Btu. X X.1 X.2 X.3 X.4 X.5
## 1
## 2 1980 1981 1982 1983 1984 1985
## X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13 X.14 X.15 X.16 X.17 X.18 X.19 X.20
## 1
## 2 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
## X.21 X.22 X.23 X.24 X.25 X.26 X.27 X.28 X.29
## 1
## 2 2001 2002 2003 2004 2005 2006 2007 2008 2009
coal <- read.csv("C:/Users/karolo/Desktop/data/coal.csv", skip = 2)
head(coal, n=2)
## X X1980 X1981 X1982 X1983 X1984 X1985 X1986
## 1 North America 16.45179 16.98772 16.47546 17.12407 18.4267 18.81819 18.52559
## 2 Bermuda 0 0 0 0 0 0 0
## X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994
## 1 19.43781 20.40363 20.62571 20.5602 20.4251 20.64672 21.28219 21.39631
## 2 0 0 0 0 0 0 0 0
## X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002
## 1 21.64225 22.57572 23.20491 23.5002 23.4747 24.55583 23.62705 23.69876
## 2 0 0 0 0 0 0 0 0
## X2003 X2004 X2005 X2006 X2007 X2008 X2009
## 1 24.17788 24.36024 24.6876 24.32174 24.54746 24.11993 21.14803
## 2 0 0 0 0 0 0 0
# Rename the first column as region
colnames(coal)[1] <- "region"
head(coal, n=2)
## region X1980 X1981 X1982 X1983 X1984 X1985 X1986
## 1 North America 16.45179 16.98772 16.47546 17.12407 18.4267 18.81819 18.52559
## 2 Bermuda 0 0 0 0 0 0 0
## X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994
## 1 19.43781 20.40363 20.62571 20.5602 20.4251 20.64672 21.28219 21.39631
## 2 0 0 0 0 0 0 0 0
## X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002
## 1 21.64225 22.57572 23.20491 23.5002 23.4747 24.55583 23.62705 23.69876
## 2 0 0 0 0 0 0 0 0
## X2003 X2004 X2005 X2006 X2007 X2008 X2009
## 1 24.17788 24.36024 24.6876 24.32174 24.54746 24.11993 21.14803
## 2 0 0 0 0 0 0 0
# Convert from a wide dataset to a long dataset using pivot_longer
coal_long <- coal %>%
  pivot_longer(!region, names_to = "year", values_to = "coal_consumption")
head(coal_long)
## # A tibble: 6 x 3
## region year coal_consumption
## <chr> <chr> <chr>
## 1 North America X1980 16.45179
## 2 North America X1981 16.98772
## 3 North America X1982 16.47546
## 4 North America X1983 17.12407
## 5 North America X1984 18.4267
## 6 North America X1985 18.81819
# Try converting the years to integers (the warning and the summary() below show this doesn't work yet; a fix is sketched further down)
coal_long <- coal_long %>%
mutate(year=as.integer(year))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
summary(coal_long)
## region year coal_consumption
## Length:6960 Min. : NA Length:6960
## Class :character 1st Qu.: NA Class :character
## Mode :character Median : NA Mode :character
## Mean :NaN
## 3rd Qu.: NA
## Max. : NA
## NA's :6960
# Convert coal consumption to numeric
coal_long <- coal_long %>%
mutate(coal_consumption=as.numeric(coal_consumption))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
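# The all-NA year column in summary() above happened because read.csv() prefixed the
# numeric column names with "X", so as.integer("X1980") is NA. A sketch of one fix:
# redo the reshape and let pivot_longer() strip the prefix (names_prefix) and convert
# the type (names_transform, tidyr >= 1.1); the values still arrive as character, so
# convert them as before.
coal_long <- coal %>%
  pivot_longer(!region, names_to = "year", values_to = "coal_consumption",
               names_prefix = "X",
               names_transform = list(year = as.integer)) %>%
  mutate(coal_consumption = as.numeric(coal_consumption))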
# Look at region values - they contain both continents and countries
unique(coal_long$region)
## [1] "North America" "Bermuda"
## [3] "Canada" "Greenland"
## [5] "Mexico" "Saint Pierre and Miquelon"
## [7] "United States" "Central & South America"
## [9] "Antarctica" "Antigua and Barbuda"
## [11] "Argentina" "Aruba"
## [13] "Bahamas, The" "Barbados"
## [15] "Belize" "Bolivia"
## [17] "Brazil" "Cayman Islands"
## [19] "Chile" "Colombia"
## [21] "Costa Rica" "Cuba"
## [23] "Dominica" "Dominican Republic"
## [25] "Ecuador" "El Salvador"
## [27] "Falkland Islands (Islas Malvinas)" "French Guiana"
## [29] "Grenada" "Guadeloupe"
## [31] "Guatemala" "Guyana"
## [33] "Haiti" "Honduras"
## [35] "Jamaica" "Martinique"
## [37] "Montserrat" "Netherlands Antilles"
## [39] "Nicaragua" "Panama"
## [41] "Paraguay" "Peru"
## [43] "Puerto Rico" "Saint Kitts and Nevis"
## [45] "Saint Lucia" "Saint Vincent/Grenadines"
## [47] "Suriname" "Trinidad and Tobago"
## [49] "Turks and Caicos Islands" "Uruguay"
## [51] "Venezuela" "Virgin Islands, U.S."
## [53] "Virgin Islands, British" "Europe"
## [55] "Albania" "Austria"
## [57] "Belgium" "Bosnia and Herzegovina"
## [59] "Bulgaria" "Croatia"
## [61] "Cyprus" "Czech Republic"
## [63] "Denmark" "Faroe Islands"
## [65] "Finland" "Former Czechoslovakia"
## [67] "Former Serbia and Montenegro" "Former Yugoslavia"
## [69] "France" "Germany"
## [71] "Germany, East" "Germany, West"
## [73] "Gibraltar" "Greece"
## [75] "Hungary" "Iceland"
## [77] "Ireland" "Italy"
## [79] "Luxembourg" "Macedonia"
## [81] "Malta" "Montenegro"
## [83] "Netherlands" "Norway"
## [85] "Poland" "Portugal"
## [87] "Romania" "Serbia"
## [89] "Slovakia" "Slovenia"
## [91] "Spain" "Sweden"
## [93] "Switzerland" "Turkey"
## [95] "United Kingdom" "Eurasia"
## [97] "Armenia" "Azerbaijan"
## [99] "Belarus" "Estonia"
## [101] "Former U.S.S.R." "Georgia"
## [103] "Kazakhstan" "Kyrgyzstan"
## [105] "Latvia" "Lithuania"
## [107] "Moldova" "Russia"
## [109] "Tajikistan" "Turkmenistan"
## [111] "Ukraine" "Uzbekistan"
## [113] "Middle East" "Bahrain"
## [115] "Iran" "Iraq"
## [117] "Israel" "Jordan"
## [119] "Kuwait" "Lebanon"
## [121] "Oman" "Palestine"
## [123] "Qatar" "Saudi Arabia"
## [125] "Syria" "United Arab Emirates"
## [127] "Yemen" "Africa"
## [129] "Algeria" "Angola"
## [131] "Benin" "Botswana"
## [133] "Burkina Faso" "Burundi"
## [135] "Cameroon" "Cape Verde"
## [137] "Central African Republic" "Chad"
## [139] "Comoros" "Congo (Brazzaville)"
## [141] "Congo (Kinshasa)" "Cote dIvoire (IvoryCoast)"
## [143] "Djibouti" "Egypt"
## [145] "Equatorial Guinea" "Eritrea"
## [147] "Ethiopia" "Gabon"
## [149] "Gambia, The" "Ghana"
## [151] "Guinea" "Guinea-Bissau"
## [153] "Kenya" "Lesotho"
## [155] "Liberia" "Libya"
## [157] "Madagascar" "Malawi"
## [159] "Mali" "Mauritania"
## [161] "Mauritius" "Morocco"
## [163] "Mozambique" "Namibia"
## [165] "Niger" "Nigeria"
## [167] "Reunion" "Rwanda"
## [169] "Saint Helena" "Sao Tome and Principe"
## [171] "Senegal" "Seychelles"
## [173] "Sierra Leone" "Somalia"
## [175] "South Africa" "Sudan"
## [177] "Swaziland" "Tanzania"
## [179] "Togo" "Tunisia"
## [181] "Uganda" "Western Sahara"
## [183] "Zambia" "Zimbabwe"
## [185] "Asia & Oceania" "Afghanistan"
## [187] "American Samoa" "Australia"
## [189] "Bangladesh" "Bhutan"
## [191] "Brunei" "Burma (Myanmar)"
## [193] "Cambodia" "China"
## [195] "Cook Islands" "Fiji"
## [197] "French Polynesia" "Guam"
## [199] "Hawaiian Trade Zone" "Hong Kong"
## [201] "India" "Indonesia"
## [203] "Japan" "Kiribati"
## [205] "Korea, North" "Korea, South"
## [207] "Laos" "Macau"
## [209] "Malaysia" "Maldives"
## [211] "Mongolia" "Nauru"
## [213] "Nepal" "New Caledonia"
## [215] "New Zealand" "Niue"
## [217] "Pakistan" "Papua New Guinea"
## [219] "Philippines" "Samoa"
## [221] "Singapore" "Solomon Islands"
## [223] "Sri Lanka" "Taiwan"
## [225] "Thailand" "Timor-Leste (East Timor)"
## [227] "Tonga" "U.S. Pacific Islands"
## [229] "Vanuatu" "Vietnam"
## [231] "Wake Island" "World"
# Create a vector of "noncountry" values that appear in the region variable
noncountries <- c("North America", "Central & South America", "Antarctica", "Europe", "Eurasia",
"Middle East", "Africa", "Asia & Oceania", "World")
# create a tibble of regional values
coal_region <- coal_long %>% filter(region %in% noncountries)
# create a tibble of country values
coal_country <- coal_long %>% filter(!(region %in% noncountries))
# check them out
unique(coal_region$region)
## [1] "North America" "Central & South America"
## [3] "Antarctica" "Europe"
## [5] "Eurasia" "Middle East"
## [7] "Africa" "Asia & Oceania"
## [9] "World"
unique(coal_country$region)
## [1] "Bermuda" "Canada"
## [3] "Greenland" "Mexico"
## [5] "Saint Pierre and Miquelon" "United States"
## [7] "Antigua and Barbuda" "Argentina"
## [9] "Aruba" "Bahamas, The"
## [11] "Barbados" "Belize"
## [13] "Bolivia" "Brazil"
## [15] "Cayman Islands" "Chile"
## [17] "Colombia" "Costa Rica"
## [19] "Cuba" "Dominica"
## [21] "Dominican Republic" "Ecuador"
## [23] "El Salvador" "Falkland Islands (Islas Malvinas)"
## [25] "French Guiana" "Grenada"
## [27] "Guadeloupe" "Guatemala"
## [29] "Guyana" "Haiti"
## [31] "Honduras" "Jamaica"
## [33] "Martinique" "Montserrat"
## [35] "Netherlands Antilles" "Nicaragua"
## [37] "Panama" "Paraguay"
## [39] "Peru" "Puerto Rico"
## [41] "Saint Kitts and Nevis" "Saint Lucia"
## [43] "Saint Vincent/Grenadines" "Suriname"
## [45] "Trinidad and Tobago" "Turks and Caicos Islands"
## [47] "Uruguay" "Venezuela"
## [49] "Virgin Islands, U.S." "Virgin Islands, British"
## [51] "Albania" "Austria"
## [53] "Belgium" "Bosnia and Herzegovina"
## [55] "Bulgaria" "Croatia"
## [57] "Cyprus" "Czech Republic"
## [59] "Denmark" "Faroe Islands"
## [61] "Finland" "Former Czechoslovakia"
## [63] "Former Serbia and Montenegro" "Former Yugoslavia"
## [65] "France" "Germany"
## [67] "Germany, East" "Germany, West"
## [69] "Gibraltar" "Greece"
## [71] "Hungary" "Iceland"
## [73] "Ireland" "Italy"
## [75] "Luxembourg" "Macedonia"
## [77] "Malta" "Montenegro"
## [79] "Netherlands" "Norway"
## [81] "Poland" "Portugal"
## [83] "Romania" "Serbia"
## [85] "Slovakia" "Slovenia"
## [87] "Spain" "Sweden"
## [89] "Switzerland" "Turkey"
## [91] "United Kingdom" "Armenia"
## [93] "Azerbaijan" "Belarus"
## [95] "Estonia" "Former U.S.S.R."
## [97] "Georgia" "Kazakhstan"
## [99] "Kyrgyzstan" "Latvia"
## [101] "Lithuania" "Moldova"
## [103] "Russia" "Tajikistan"
## [105] "Turkmenistan" "Ukraine"
## [107] "Uzbekistan" "Bahrain"
## [109] "Iran" "Iraq"
## [111] "Israel" "Jordan"
## [113] "Kuwait" "Lebanon"
## [115] "Oman" "Palestine"
## [117] "Qatar" "Saudi Arabia"
## [119] "Syria" "United Arab Emirates"
## [121] "Yemen" "Algeria"
## [123] "Angola" "Benin"
## [125] "Botswana" "Burkina Faso"
## [127] "Burundi" "Cameroon"
## [129] "Cape Verde" "Central African Republic"
## [131] "Chad" "Comoros"
## [133] "Congo (Brazzaville)" "Congo (Kinshasa)"
## [135] "Cote dIvoire (IvoryCoast)" "Djibouti"
## [137] "Egypt" "Equatorial Guinea"
## [139] "Eritrea" "Ethiopia"
## [141] "Gabon" "Gambia, The"
## [143] "Ghana" "Guinea"
## [145] "Guinea-Bissau" "Kenya"
## [147] "Lesotho" "Liberia"
## [149] "Libya" "Madagascar"
## [151] "Malawi" "Mali"
## [153] "Mauritania" "Mauritius"
## [155] "Morocco" "Mozambique"
## [157] "Namibia" "Niger"
## [159] "Nigeria" "Reunion"
## [161] "Rwanda" "Saint Helena"
## [163] "Sao Tome and Principe" "Senegal"
## [165] "Seychelles" "Sierra Leone"
## [167] "Somalia" "South Africa"
## [169] "Sudan" "Swaziland"
## [171] "Tanzania" "Togo"
## [173] "Tunisia" "Uganda"
## [175] "Western Sahara" "Zambia"
## [177] "Zimbabwe" "Afghanistan"
## [179] "American Samoa" "Australia"
## [181] "Bangladesh" "Bhutan"
## [183] "Brunei" "Burma (Myanmar)"
## [185] "Cambodia" "China"
## [187] "Cook Islands" "Fiji"
## [189] "French Polynesia" "Guam"
## [191] "Hawaiian Trade Zone" "Hong Kong"
## [193] "India" "Indonesia"
## [195] "Japan" "Kiribati"
## [197] "Korea, North" "Korea, South"
## [199] "Laos" "Macau"
## [201] "Malaysia" "Maldives"
## [203] "Mongolia" "Nauru"
## [205] "Nepal" "New Caledonia"
## [207] "New Zealand" "Niue"
## [209] "Pakistan" "Papua New Guinea"
## [211] "Philippines" "Samoa"
## [213] "Singapore" "Solomon Islands"
## [215] "Sri Lanka" "Taiwan"
## [217] "Thailand" "Timor-Leste (East Timor)"
## [219] "Tonga" "U.S. Pacific Islands"
## [221] "Vanuatu" "Vietnam"
## [223] "Wake Island"