Data Wrangling in R

### Tible from data set
tibb <- as_tibble(CO2)
tibb

## # A tibble: 84 x 5
##    Plant Type   Treatment   conc uptake
##    <ord> <fct>  <fct>      <dbl>  <dbl>
##  1 Qn1   Quebec nonchilled    95   16  
##  2 Qn1   Quebec nonchilled   175   30.4
##  3 Qn1   Quebec nonchilled   250   34.8
##  4 Qn1   Quebec nonchilled   350   37.2
##  5 Qn1   Quebec nonchilled   500   35.3
##  6 Qn1   Quebec nonchilled   675   39.2
##  7 Qn1   Quebec nonchilled  1000   39.7
##  8 Qn2   Quebec nonchilled    95   13.6
##  9 Qn2   Quebec nonchilled   175   27.3
## 10 Qn2   Quebec nonchilled   250   37.1
## # ... with 74 more rows

### Tibble from scratch
name <- c("Karol", "Matt", "Adam", "Santi")
birthyear <- c(2000, 2001,2002, 2003)
eyes <- c("blue", "black", "green", "brown")

people <- tibble(name, birthyear, eyes)
people

## # A tibble: 4 x 3
##   name  birthyear eyes 
##   <chr>     <dbl> <chr>
## 1 Karol      2000 blue 
## 2 Matt       2001 black
## 3 Adam       2002 green
## 4 Santi      2003 brown

Subsetting

Extracts a single variable as a vector

people$eyes

## [1] "blue"  "black" "green" "brown"

unique(people$eyes)

## [1] "blue"  "black" "green" "brown"

## Operator
people[["eyes"]]

## [1] "blue"  "black" "green" "brown"

people[[3]]

## [1] "blue"  "black" "green" "brown"

CSV files

## CSV files

inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv")
## more info ?read_csv


names<- c("ID", "DBAName", "AkaName", "License", "Facilitytype", "Risk", "Address", "City", "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations", "Latitude", "Longitude", "Location")

inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv",
                        col.names = names)

glimpse(inspections)

## Rows: 145,606
## Columns: 17
## $ ID             <int> 1995829, 1995817, 1995822, 1995814, 1995811, 1995752, 1~
## $ DBAName        <chr> "SUBWAY", "VENEZUELAN BOWL GRILL LLC", "SEVEN TEN", "CH~
## $ AkaName        <chr> "SUBWAY", "VENEZUELAN BOWL GRILL", "SEVEN TEN", "CHISME~
## $ License        <int> 1679112, 2424110, 1172093, 1334960, 2334190, 2517730, 2~
## $ Facilitytype   <chr> "Restaurant", "Restaurant", "Restaurant", "Restaurant",~
## $ Risk           <chr> "Risk 1 (High)", "Risk 1 (High)", "Risk 1 (High)", "Ris~
## $ Address        <chr> "8711 S ASHLAND AVE ", "2436 N LINCOLN AVE ", "1055 E 5~
## $ City           <chr> "CHICAGO", "CHICAGO", "CHICAGO", "CHICAGO", "CHICAGO", ~
## $ State          <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "IL", "~
## $ Zip            <int> 60620, 60614, 60615, 60629, 60615, 60621, 60630, 60614,~
## $ InspectionDate <chr> "03/17/2017", "03/17/2017", "03/17/2017", "03/17/2017",~
## $ InspectionType <chr> "Canvass Re-Inspection", "Canvass Re-Inspection", "Canv~
## $ Results        <chr> "Pass", "Pass", "Pass", "Pass", "Pass", "Pass", "Out of~
## $ Violations     <chr> "19. OUTSIDE GARBAGE WASTE GREASE AND STORAGE AREA; CLE~
## $ Latitude       <dbl> 41.73541, 41.92619, 41.79492, 41.78429, 41.79517, 41.77~
## $ Longitude      <dbl> -87.66291, -87.65002, -87.59985, -87.72277, -87.59710, ~
## $ Location       <chr> "(41.735408702583165, -87.66290968424336)", "(41.926190~

TSV (Tab-separated Values) files

inpatient <- read_tsv("C:/Users/karolo/Desktop/data/inpatient.tsv")

## Rows: 163065 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (10): DRG Definition, Provider Name, Provider Street Address, Provider C...
## dbl  (2): Provider Id, Total Discharges

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

names_i <- c("DRG", "ProviderID", "Name", "Address", "City", "State", "Zip", "Region", "Discharges", "AverageCharges", "AverageTotalPayments", "AverageMedicarePayments")

inpatient <- read.delim("C:/Users/karolo/Desktop/data/inpatient.tsv",
                        col.names = names_i)



inpatient$Discharges <- as.integer(inpatient$Discharges)
inpatient$ProviderID <- as.character(inpatient$ProviderID)
inpatient$Zip <- as.character(inpatient$Zip)
  
glimpse(inpatient)

## Rows: 163,065
## Columns: 12
## $ DRG                     <chr> "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC", "0~
## $ ProviderID              <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name                    <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address                 <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City                    <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State                   <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip                     <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region                  <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges              <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges          <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments    <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~

Delimited Files (txt)

stoppages <- read_delim("C:/Users/karolo/Desktop/data/workstoppages.txt", delim = "^")

## Rows: 434 Columns: 3

## -- Column specification --------------------------------------------------------
## Delimiter: "^"
## chr (1): period
## dbl (2): year, workers

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(stoppages)

## Rows: 434
## Columns: 3
## $ year    <dbl> 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 1981, 19~
## $ period  <chr> "January", "February", "March", "April", "May", "June", "July"~
## $ workers <dbl> 29600, 20900, 207800, 223500, 259000, 415100, 125400, 86600, 6~

The readxl package

## The readxl package isn't part of the core tidyverse so we need to load it separately
library(readxl) 
####  Try just reading the file without other arguments

breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx")

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...

glimpse(breakfast)

## Rows: 46
## Columns: 7
## $ `School Breakfast Program Participation and Meals Served` <chr> "(Data as of~
## $ ...2                                                      <chr> NA, "      -~
## $ ...3                                                      <chr> NA, NA, "Red~
## $ ...4                                                      <chr> NA, NA, "Pai~
## $ ...5                                                      <chr> NA, NA, "Tot~
## $ ...6                                                      <chr> NA, NA, "Mea~
## $ ...7                                                      <chr> NA, NA, "Fre~

##  Try skipping three lines

breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx", skip=3)
glimpse(breakfast)

## Rows: 43
## Columns: 7
## $ `Fiscal Years `            <chr> "    ", "1975", "1976", "1977", "1978", "19~
## $ Free                       <chr> "------------------------------Millions----~
## $ `Red. Price `              <dbl> NA, 0.04, 0.06, 0.11, 0.16, 0.21, 0.25, 0.2~
## $ Paid                       <dbl> NA, 0.33, 0.37, 0.36, 0.42, 0.54, 0.56, 0.5~
## $ `Total `                   <dbl> NA, 1.82, 2.20, 2.49, 2.80, 3.32, 3.60, 3.8~
## $ `Meals Served`             <dbl> NA, 294.70, 353.60, 434.30, 478.80, 565.60,~
## $ `Free/ RP of Total Meals ` <chr> "--Percent--", "82.1", "84.2", "85.7", "85.~

# It looks like we need to skip five lines, which will remove the column names
# So lets create a vector with column names
names <- c("Year", "FreeStudents", "ReducedStudents", "PaidStudents", "TotalStudents", 
           "MealsServed", "PercentFree")

# And then try reading the file again
breakfast <- read_excel("C:/Users/karolo/Desktop/data/breakfast.xlsx",skip=5, col_names = names)
glimpse(breakfast)

## Rows: 42
## Columns: 7
## $ Year            <dbl> 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, ~
## $ FreeStudents    <dbl> 1.45, 1.76, 2.02, 2.23, 2.56, 2.79, 3.05, 2.80, 2.87, ~
## $ ReducedStudents <dbl> 0.04, 0.06, 0.11, 0.16, 0.21, 0.25, 0.25, 0.16, 0.15, ~
## $ PaidStudents    <dbl> 0.33, 0.37, 0.36, 0.42, 0.54, 0.56, 0.51, 0.36, 0.34, ~
## $ TotalStudents   <dbl> 1.82, 2.20, 2.49, 2.80, 3.32, 3.60, 3.81, 3.32, 3.36, ~
## $ MealsServed     <dbl> 294.70, 353.60, 434.30, 478.80, 565.60, 619.90, 644.20~
## $ PercentFree     <dbl> 82.1, 84.2, 85.7, 85.3, 84.1, 85.2, 86.9, 89.3, 90.3, ~

# I'll do a little quick manipulation of this tibble. 
# First, convert the numbers of students and meals to real values
breakfast <- breakfast %>%
  mutate(FreeStudents=FreeStudents*1000000,
         ReducedStudents=ReducedStudents * 1000000,
         PaidStudents = PaidStudents * 1000000,
         TotalStudents = TotalStudents * 1000000,
         MealsServed = MealsServed * 1000000,
         PercentFree = PercentFree/100)

glimpse(breakfast)

## Rows: 42
## Columns: 7
## $ Year            <dbl> 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, ~
## $ FreeStudents    <dbl> 1450000, 1760000, 2020000, 2230000, 2560000, 2790000, ~
## $ ReducedStudents <dbl> 40000, 60000, 110000, 160000, 210000, 250000, 250000, ~
## $ PaidStudents    <dbl> 330000, 370000, 360000, 420000, 540000, 560000, 510000~
## $ TotalStudents   <dbl> 1820000, 2200000, 2490000, 2800000, 3320000, 3600000, ~
## $ MealsServed     <dbl> 294700000, 353600000, 434300000, 478800000, 565600000,~
## $ PercentFree     <dbl> 0.821, 0.842, 0.857, 0.853, 0.841, 0.852, 0.869, 0.893~

Data Transformation

Making long datasets

pew <- read.csv("C:/Users/karolo/Desktop/data/pew.csv")

pew_long <- pivot_longer(pew, !religion, names_to = "Income", values_to = "freq")

head(pew_long)

## # A tibble: 6 x 3
##   religion Income    freq
##   <chr>    <chr>    <int>
## 1 Agnostic X..10k      27
## 2 Agnostic X.10.20k    34
## 3 Agnostic X.20.30k    60
## 4 Agnostic X.30.40k    81
## 5 Agnostic X.40.50k    76
## 6 Agnostic X.50.75k   137

Making wide datasets

weather <- read.csv("C:/Users/karolo/Desktop/data/mexicanweather.csv")
head(weather)

##       station element value       date
## 1 MX000017004    TMAX   310 1955-04-01
## 2 MX000017004    TMIN   150 1955-04-01
## 3 MX000017004    TMAX   310 1955-05-01
## 4 MX000017004    TMIN   200 1955-05-01
## 5 MX000017004    TMAX   300 1955-06-01
## 6 MX000017004    TMIN   160 1955-06-01

weatherwide <- pivot_wider(weather, names_from = element, values_from = value)
head(weatherwide)

## # A tibble: 6 x 4
##   station     date        TMAX  TMIN
##   <chr>       <chr>      <int> <int>
## 1 MX000017004 1955-04-01   310   150
## 2 MX000017004 1955-05-01   310   200
## 3 MX000017004 1955-06-01   300   160
## 4 MX000017004 1955-07-01   270   150
## 5 MX000017004 1955-08-01   230   140
## 6 MX000017004 1955-09-01   230   150

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## year() extracts the year from a date
## month() extracts the month value
## day() extracts the day of the month
## wday() returns the day of the week
## yday() returns the day of the year



## hour() extracts the hour from a time
## minute() extracts the minute value
## second() extracts the second value

# Date Construction Functions

## April 1, 2018, function mdy()
## 2018-04-01, ymd()
## 01-04-2018, dmy()
## April 1, 2018 04:32:16, mdy_hms
## ymd_hms, dmy_hms

weather_time <- weather %>% 
  mutate(year= year(date), month= month(date), day= day(date))
head(weather_time)

##       station element value       date year month day
## 1 MX000017004    TMAX   310 1955-04-01 1955     4   1
## 2 MX000017004    TMIN   150 1955-04-01 1955     4   1
## 3 MX000017004    TMAX   310 1955-05-01 1955     5   1
## 4 MX000017004    TMIN   200 1955-05-01 1955     5   1
## 5 MX000017004    TMAX   300 1955-06-01 1955     6   1
## 6 MX000017004    TMIN   160 1955-06-01 1955     6   1

wday("2018-04-01")

## [1] 1

mdy("04/01/22")

## [1] "2022-04-01"

names<- c("ID", "DBAName", "AkaName", "License", "Facilitytype", "Risk", "Address", "City", "State", "Zip", "InspectionDate", "InspectionType", "Results", "Violations", "Latitude", "Longitude", "Location")

inspections <- read.csv("C:/Users/karolo/Desktop/data/inspections.csv", 
                        col.names = names)


# Create a new column called Regions that combines City and State
regional_inspections <- unite(inspections,Region,City,State,sep=", ", remove=FALSE)

# And take a look at the unique regions
unique(regional_inspections$Region)

##  [1] "CHICAGO, IL"              ", IL"                    
##  [3] "ELK GROVE VILLAGE, IL"    "SCHAUMBURG, IL"          
##  [5] "Chicago, IL"              "CALUMET CITY, IL"        
##  [7] "SKOKIE, IL"               "chicago, IL"             
##  [9] "EVANSTON, IL"             "CCHICAGO, IL"            
## [11] "NILES NILES, IL"          "CHicago, IL"             
## [13] "ELMHURST, IL"             ", "                      
## [15] "SCHILLER PARK, IL"        "OAK PARK, IL"            
## [17] "BRIDEVIEW, IL"            "CHESTNUT STREET, IL"     
## [19] "BEDFORD PARK, IL"         "BANNOCKBURNDEERFIELD, IL"
## [21] "CHCICAGO, IL"             "BLOOMINGDALE, IL"        
## [23] "CICERO, IL"               "Norridge, IL"            
## [25] "CHICAGO, "                "CHARLES A HAYES, IL"     
## [27] "CHICAGOCHICAGO, IL"       "MAYWOOD, IL"             
## [29] "CHCHICAGO, IL"            "CHICAGOI, IL"            
## [31] "SUMMIT, IL"               "OOLYMPIA FIELDS, IL"     
## [33] "WESTMONT, IL"             "CHICAGO HEIGHTS, IL"     
## [35] "JUSTICE, IL"              "TINLEY PARK, IL"         
## [37] "LOMBARD, IL"              "EAST HAZEL CREST, IL"    
## [39] "COUNTRY CLUB HILLS, IL"   "STREAMWOOD, IL"          
## [41] "BOLINGBROOK, IL"          "INACTIVE, IL"            
## [43] "BERWYN, IL"               "BURNHAM, IL"             
## [45] "DES PLAINES, IL"          "LAKE ZURICH, IL"         
## [47] "OLYMPIA FIELDS, IL"       "alsip, IL"               
## [49] "OAK LAWN, IL"             "BLUE ISLAND, IL"         
## [51] "GLENCOE, IL"              "FRANKFORT, IL"           
## [53] "NAPERVILLE, IL"           "BROADVIEW, IL"           
## [55] "WORTH, IL"                "Maywood, IL"             
## [57] "ALSIP, IL"                "EVERGREEN PARK, IL"

# We need to load stringr separately
library(stringr)

# Let's handle the uppercase/lowercase issues by converting everything to uppercase
regional_inspections <- regional_inspections %>%
  mutate(Region=str_to_upper(Region))


# What were the results of that?
unique(regional_inspections$Region)

##  [1] "CHICAGO, IL"              ", IL"                    
##  [3] "ELK GROVE VILLAGE, IL"    "SCHAUMBURG, IL"          
##  [5] "CALUMET CITY, IL"         "SKOKIE, IL"              
##  [7] "EVANSTON, IL"             "CCHICAGO, IL"            
##  [9] "NILES NILES, IL"          "ELMHURST, IL"            
## [11] ", "                       "SCHILLER PARK, IL"       
## [13] "OAK PARK, IL"             "BRIDEVIEW, IL"           
## [15] "CHESTNUT STREET, IL"      "BEDFORD PARK, IL"        
## [17] "BANNOCKBURNDEERFIELD, IL" "CHCICAGO, IL"            
## [19] "BLOOMINGDALE, IL"         "CICERO, IL"              
## [21] "NORRIDGE, IL"             "CHICAGO, "               
## [23] "CHARLES A HAYES, IL"      "CHICAGOCHICAGO, IL"      
## [25] "MAYWOOD, IL"              "CHCHICAGO, IL"           
## [27] "CHICAGOI, IL"             "SUMMIT, IL"              
## [29] "OOLYMPIA FIELDS, IL"      "WESTMONT, IL"            
## [31] "CHICAGO HEIGHTS, IL"      "JUSTICE, IL"             
## [33] "TINLEY PARK, IL"          "LOMBARD, IL"             
## [35] "EAST HAZEL CREST, IL"     "COUNTRY CLUB HILLS, IL"  
## [37] "STREAMWOOD, IL"           "BOLINGBROOK, IL"         
## [39] "INACTIVE, IL"             "BERWYN, IL"              
## [41] "BURNHAM, IL"              "DES PLAINES, IL"         
## [43] "LAKE ZURICH, IL"          "OLYMPIA FIELDS, IL"      
## [45] "ALSIP, IL"                "OAK LAWN, IL"            
## [47] "BLUE ISLAND, IL"          "GLENCOE, IL"             
## [49] "FRANKFORT, IL"            "NAPERVILLE, IL"          
## [51] "BROADVIEW, IL"            "WORTH, IL"               
## [53] "EVERGREEN PARK, IL"

# Let's take care of a few misspellings of Chicago
badchicagos <- c('CCHICAGO, IL', 'CHCICAGO, IL', 'CHICAGOCHICAGO, IL', 'CHCHICAGO, IL', 'CHICAGOI, IL')

regional_inspections <- regional_inspections %>%
  mutate(Region=ifelse(Region %in% badchicagos, 'CHICAGO, IL', Region)) 
  
# And see what's left
unique(regional_inspections$Region)

##  [1] "CHICAGO, IL"              ", IL"                    
##  [3] "ELK GROVE VILLAGE, IL"    "SCHAUMBURG, IL"          
##  [5] "CALUMET CITY, IL"         "SKOKIE, IL"              
##  [7] "EVANSTON, IL"             "NILES NILES, IL"         
##  [9] "ELMHURST, IL"             ", "                      
## [11] "SCHILLER PARK, IL"        "OAK PARK, IL"            
## [13] "BRIDEVIEW, IL"            "CHESTNUT STREET, IL"     
## [15] "BEDFORD PARK, IL"         "BANNOCKBURNDEERFIELD, IL"
## [17] "BLOOMINGDALE, IL"         "CICERO, IL"              
## [19] "NORRIDGE, IL"             "CHICAGO, "               
## [21] "CHARLES A HAYES, IL"      "MAYWOOD, IL"             
## [23] "SUMMIT, IL"               "OOLYMPIA FIELDS, IL"     
## [25] "WESTMONT, IL"             "CHICAGO HEIGHTS, IL"     
## [27] "JUSTICE, IL"              "TINLEY PARK, IL"         
## [29] "LOMBARD, IL"              "EAST HAZEL CREST, IL"    
## [31] "COUNTRY CLUB HILLS, IL"   "STREAMWOOD, IL"          
## [33] "BOLINGBROOK, IL"          "INACTIVE, IL"            
## [35] "BERWYN, IL"               "BURNHAM, IL"             
## [37] "DES PLAINES, IL"          "LAKE ZURICH, IL"         
## [39] "OLYMPIA FIELDS, IL"       "ALSIP, IL"               
## [41] "OAK LAWN, IL"             "BLUE ISLAND, IL"         
## [43] "GLENCOE, IL"              "FRANKFORT, IL"           
## [45] "NAPERVILLE, IL"           "BROADVIEW, IL"           
## [47] "WORTH, IL"                "EVERGREEN PARK, IL"

# There are some "CHICAGO, NA" values that we can clearly correct to "CHICAGO, IL"
regional_inspections <- regional_inspections %>%
  mutate(Region=ifelse(Region=='CHICAGO, NA', 'CHICAGO, IL', Region)) 

# But we don't know what to do with "NA, IL", "NA, NA", or "INACTIVE, IL"
# so let's set those to missing values
nachicagos <- c('NA, IL', 'NA, NA', 'INACTIVE, IL')

regional_inspections <- regional_inspections %>%
  mutate(Region=ifelse(Region %in% nachicagos, NA, Region)) 

# How did we do?
unique(regional_inspections$Region)

##  [1] "CHICAGO, IL"              ", IL"                    
##  [3] "ELK GROVE VILLAGE, IL"    "SCHAUMBURG, IL"          
##  [5] "CALUMET CITY, IL"         "SKOKIE, IL"              
##  [7] "EVANSTON, IL"             "NILES NILES, IL"         
##  [9] "ELMHURST, IL"             ", "                      
## [11] "SCHILLER PARK, IL"        "OAK PARK, IL"            
## [13] "BRIDEVIEW, IL"            "CHESTNUT STREET, IL"     
## [15] "BEDFORD PARK, IL"         "BANNOCKBURNDEERFIELD, IL"
## [17] "BLOOMINGDALE, IL"         "CICERO, IL"              
## [19] "NORRIDGE, IL"             "CHICAGO, "               
## [21] "CHARLES A HAYES, IL"      "MAYWOOD, IL"             
## [23] "SUMMIT, IL"               "OOLYMPIA FIELDS, IL"     
## [25] "WESTMONT, IL"             "CHICAGO HEIGHTS, IL"     
## [27] "JUSTICE, IL"              "TINLEY PARK, IL"         
## [29] "LOMBARD, IL"              "EAST HAZEL CREST, IL"    
## [31] "COUNTRY CLUB HILLS, IL"   "STREAMWOOD, IL"          
## [33] "BOLINGBROOK, IL"          NA                        
## [35] "BERWYN, IL"               "BURNHAM, IL"             
## [37] "DES PLAINES, IL"          "LAKE ZURICH, IL"         
## [39] "OLYMPIA FIELDS, IL"       "ALSIP, IL"               
## [41] "OAK LAWN, IL"             "BLUE ISLAND, IL"         
## [43] "GLENCOE, IL"              "FRANKFORT, IL"           
## [45] "NAPERVILLE, IL"           "BROADVIEW, IL"           
## [47] "WORTH, IL"                "EVERGREEN PARK, IL"

# Separate at the fourth position
inpatient_separate <- separate(inpatient,DRG,c('DRGcode','DRGdescription'),4)

# And take a look at the data now
glimpse(inpatient_separate)

## Rows: 163,065
## Columns: 13
## $ DRGcode                 <chr> "039 ", "039 ", "039 ", "039 ", "039 ", "039 "~
## $ DRGdescription          <chr> "- EXTRACRANIAL PROCEDURES W/O CC/MCC", "- EXT~
## $ ProviderID              <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name                    <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address                 <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City                    <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State                   <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip                     <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region                  <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges              <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges          <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments    <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~

# Trim the DRGcode field
inpatient_separate <- inpatient_separate %>%
  mutate(DRGcode=str_trim(DRGcode))

glimpse(inpatient_separate)

## Rows: 163,065
## Columns: 13
## $ DRGcode                 <chr> "039", "039", "039", "039", "039", "039", "039~
## $ DRGdescription          <chr> "- EXTRACRANIAL PROCEDURES W/O CC/MCC", "- EXT~
## $ ProviderID              <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name                    <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address                 <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City                    <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State                   <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip                     <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region                  <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges              <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges          <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments    <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~

# The DRGdescription field has a hyphen in front so we need to do something different
inpatient_separate <- inpatient_separate %>%
  mutate(DRGdescription=str_sub(DRGdescription, 3))

glimpse(inpatient_separate)

## Rows: 163,065
## Columns: 13
## $ DRGcode                 <chr> "039", "039", "039", "039", "039", "039", "039~
## $ DRGdescription          <chr> "EXTRACRANIAL PROCEDURES W/O CC/MCC", "EXTRACR~
## $ ProviderID              <chr> "10001", "10005", "10006", "10011", "10016", "~
## $ Name                    <chr> "SOUTHEAST ALABAMA MEDICAL CENTER", "MARSHALL ~
## $ Address                 <chr> "1108 ROSS CLARK CIRCLE", "2505 U S HIGHWAY 43~
## $ City                    <chr> "DOTHAN", "BOAZ", "FLORENCE", "BIRMINGHAM", "A~
## $ State                   <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL"~
## $ Zip                     <chr> "36301", "35957", "35631", "35235", "35007", "~
## $ Region                  <chr> "AL - Dothan", "AL - Birmingham", "AL - Birmin~
## $ Discharges              <int> 91, 14, 24, 25, 18, 67, 51, 32, 135, 34, 14, 4~
## $ AverageCharges          <chr> "$32,963.07 ", "$15,131.85 ", "$37,560.37 ", "~
## $ AverageTotalPayments    <chr> "$5,777.24 ", "$5,787.57 ", "$5,434.95 ", "$5,~
## $ AverageMedicarePayments <chr> "$4,763.73 ", "$4,976.71 ", "$4,453.79 ", "$4,~

CASE STUDY: Coal Consumption

library(ggplot2)
coal <- read.csv("C:/Users/karolo/Desktop/data/coal.csv")
head(coal, n=2)

##   Table..Total.Coal.Consumption..Quadrillion.Btu.    X  X.1  X.2  X.3  X.4  X.5
## 1                                                                              
## 2                                                 1980 1981 1982 1983 1984 1985
##    X.6  X.7  X.8  X.9 X.10 X.11 X.12 X.13 X.14 X.15 X.16 X.17 X.18 X.19 X.20
## 1                                                                           
## 2 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
##   X.21 X.22 X.23 X.24 X.25 X.26 X.27 X.28 X.29
## 1                                             
## 2 2001 2002 2003 2004 2005 2006 2007 2008 2009

coal <- read.csv("C:/Users/karolo/Desktop/data/coal.csv", skip = 2)
head(coal, n=2)

##               X    X1980    X1981    X1982    X1983   X1984    X1985    X1986
## 1 North America 16.45179 16.98772 16.47546 17.12407 18.4267 18.81819 18.52559
## 2       Bermuda        0        0        0        0       0        0        0
##      X1987    X1988    X1989   X1990   X1991    X1992    X1993    X1994
## 1 19.43781 20.40363 20.62571 20.5602 20.4251 20.64672 21.28219 21.39631
## 2        0        0        0       0       0        0        0        0
##      X1995    X1996    X1997   X1998   X1999    X2000    X2001    X2002
## 1 21.64225 22.57572 23.20491 23.5002 23.4747 24.55583 23.62705 23.69876
## 2        0        0        0       0       0        0        0        0
##      X2003    X2004   X2005    X2006    X2007    X2008    X2009
## 1 24.17788 24.36024 24.6876 24.32174 24.54746 24.11993 21.14803
## 2        0        0       0        0        0        0        0

# Rename the first column as region
colnames(coal)[1] <- "region"
head(coal, n=2)

##          region    X1980    X1981    X1982    X1983   X1984    X1985    X1986
## 1 North America 16.45179 16.98772 16.47546 17.12407 18.4267 18.81819 18.52559
## 2       Bermuda        0        0        0        0       0        0        0
##      X1987    X1988    X1989   X1990   X1991    X1992    X1993    X1994
## 1 19.43781 20.40363 20.62571 20.5602 20.4251 20.64672 21.28219 21.39631
## 2        0        0        0       0       0        0        0        0
##      X1995    X1996    X1997   X1998   X1999    X2000    X2001    X2002
## 1 21.64225 22.57572 23.20491 23.5002 23.4747 24.55583 23.62705 23.69876
## 2        0        0        0       0       0        0        0        0
##      X2003    X2004   X2005    X2006    X2007    X2008    X2009
## 1 24.17788 24.36024 24.6876 24.32174 24.54746 24.11993 21.14803
## 2        0        0       0        0        0        0        0

# Convert from a wide dataset to a long dataset using pivot_longer
coal_long <- coal%>%
  pivot_longer(!region, names_to = "year", values_to = "coal_consumption")

head(coal_long)

## # A tibble: 6 x 3
##   region        year  coal_consumption
##   <chr>         <chr> <chr>           
## 1 North America X1980 16.45179        
## 2 North America X1981 16.98772        
## 3 North America X1982 16.47546        
## 4 North America X1983 17.12407        
## 5 North America X1984 18.4267         
## 6 North America X1985 18.81819

# Convert years to integers
coal_long <- coal_long %>%
  mutate(year=as.integer(year))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

summary(coal_long)

##     region               year      coal_consumption  
##  Length:6960        Min.   : NA    Length:6960       
##  Class :character   1st Qu.: NA    Class :character  
##  Mode  :character   Median : NA    Mode  :character  
##                     Mean   :NaN                      
##                     3rd Qu.: NA                      
##                     Max.   : NA                      
##                     NA's   :6960

# Convert coal consumption to numeric
coal_long <- coal_long %>%
  mutate(coal_consumption=as.numeric(coal_consumption))

## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

# Look at region values - they contain both continents and countries
unique(coal_long$region)

##   [1] "North America"                     "Bermuda"                          
##   [3] "Canada"                            "Greenland"                        
##   [5] "Mexico"                            "Saint Pierre and Miquelon"        
##   [7] "United States"                     "Central & South America"          
##   [9] "Antarctica"                        "Antigua and Barbuda"              
##  [11] "Argentina"                         "Aruba"                            
##  [13] "Bahamas, The"                      "Barbados"                         
##  [15] "Belize"                            "Bolivia"                          
##  [17] "Brazil"                            "Cayman Islands"                   
##  [19] "Chile"                             "Colombia"                         
##  [21] "Costa Rica"                        "Cuba"                             
##  [23] "Dominica"                          "Dominican Republic"               
##  [25] "Ecuador"                           "El Salvador"                      
##  [27] "Falkland Islands (Islas Malvinas)" "French Guiana"                    
##  [29] "Grenada"                           "Guadeloupe"                       
##  [31] "Guatemala"                         "Guyana"                           
##  [33] "Haiti"                             "Honduras"                         
##  [35] "Jamaica"                           "Martinique"                       
##  [37] "Montserrat"                        "Netherlands Antilles"             
##  [39] "Nicaragua"                         "Panama"                           
##  [41] "Paraguay"                          "Peru"                             
##  [43] "Puerto Rico"                       "Saint Kitts and Nevis"            
##  [45] "Saint Lucia"                       "Saint Vincent/Grenadines"         
##  [47] "Suriname"                          "Trinidad and Tobago"              
##  [49] "Turks and Caicos Islands"          "Uruguay"                          
##  [51] "Venezuela"                         "Virgin Islands,  U.S."            
##  [53] "Virgin Islands, British"           "Europe"                           
##  [55] "Albania"                           "Austria"                          
##  [57] "Belgium"                           "Bosnia and Herzegovina"           
##  [59] "Bulgaria"                          "Croatia"                          
##  [61] "Cyprus"                            "Czech Republic"                   
##  [63] "Denmark"                           "Faroe Islands"                    
##  [65] "Finland"                           "Former Czechoslovakia"            
##  [67] "Former Serbia and Montenegro"      "Former Yugoslavia"                
##  [69] "France"                            "Germany"                          
##  [71] "Germany, East"                     "Germany, West"                    
##  [73] "Gibraltar"                         "Greece"                           
##  [75] "Hungary"                           "Iceland"                          
##  [77] "Ireland"                           "Italy"                            
##  [79] "Luxembourg"                        "Macedonia"                        
##  [81] "Malta"                             "Montenegro"                       
##  [83] "Netherlands"                       "Norway"                           
##  [85] "Poland"                            "Portugal"                         
##  [87] "Romania"                           "Serbia"                           
##  [89] "Slovakia"                          "Slovenia"                         
##  [91] "Spain"                             "Sweden"                           
##  [93] "Switzerland"                       "Turkey"                           
##  [95] "United Kingdom"                    "Eurasia"                          
##  [97] "Armenia"                           "Azerbaijan"                       
##  [99] "Belarus"                           "Estonia"                          
## [101] "Former U.S.S.R."                   "Georgia"                          
## [103] "Kazakhstan"                        "Kyrgyzstan"                       
## [105] "Latvia"                            "Lithuania"                        
## [107] "Moldova"                           "Russia"                           
## [109] "Tajikistan"                        "Turkmenistan"                     
## [111] "Ukraine"                           "Uzbekistan"                       
## [113] "Middle East"                       "Bahrain"                          
## [115] "Iran"                              "Iraq"                             
## [117] "Israel"                            "Jordan"                           
## [119] "Kuwait"                            "Lebanon"                          
## [121] "Oman"                              "Palestine"                        
## [123] "Qatar"                             "Saudi Arabia"                     
## [125] "Syria"                             "United Arab Emirates"             
## [127] "Yemen"                             "Africa"                           
## [129] "Algeria"                           "Angola"                           
## [131] "Benin"                             "Botswana"                         
## [133] "Burkina Faso"                      "Burundi"                          
## [135] "Cameroon"                          "Cape Verde"                       
## [137] "Central African Republic"          "Chad"                             
## [139] "Comoros"                           "Congo (Brazzaville)"              
## [141] "Congo (Kinshasa)"                  "Cote dIvoire (IvoryCoast)"        
## [143] "Djibouti"                          "Egypt"                            
## [145] "Equatorial Guinea"                 "Eritrea"                          
## [147] "Ethiopia"                          "Gabon"                            
## [149] "Gambia, The"                       "Ghana"                            
## [151] "Guinea"                            "Guinea-Bissau"                    
## [153] "Kenya"                             "Lesotho"                          
## [155] "Liberia"                           "Libya"                            
## [157] "Madagascar"                        "Malawi"                           
## [159] "Mali"                              "Mauritania"                       
## [161] "Mauritius"                         "Morocco"                          
## [163] "Mozambique"                        "Namibia"                          
## [165] "Niger"                             "Nigeria"                          
## [167] "Reunion"                           "Rwanda"                           
## [169] "Saint Helena"                      "Sao Tome and Principe"            
## [171] "Senegal"                           "Seychelles"                       
## [173] "Sierra Leone"                      "Somalia"                          
## [175] "South Africa"                      "Sudan"                            
## [177] "Swaziland"                         "Tanzania"                         
## [179] "Togo"                              "Tunisia"                          
## [181] "Uganda"                            "Western Sahara"                   
## [183] "Zambia"                            "Zimbabwe"                         
## [185] "Asia & Oceania"                    "Afghanistan"                      
## [187] "American Samoa"                    "Australia"                        
## [189] "Bangladesh"                        "Bhutan"                           
## [191] "Brunei"                            "Burma (Myanmar)"                  
## [193] "Cambodia"                          "China"                            
## [195] "Cook Islands"                      "Fiji"                             
## [197] "French Polynesia"                  "Guam"                             
## [199] "Hawaiian Trade Zone"               "Hong Kong"                        
## [201] "India"                             "Indonesia"                        
## [203] "Japan"                             "Kiribati"                         
## [205] "Korea, North"                      "Korea, South"                     
## [207] "Laos"                              "Macau"                            
## [209] "Malaysia"                          "Maldives"                         
## [211] "Mongolia"                          "Nauru"                            
## [213] "Nepal"                             "New Caledonia"                    
## [215] "New Zealand"                       "Niue"                             
## [217] "Pakistan"                          "Papua New Guinea"                 
## [219] "Philippines"                       "Samoa"                            
## [221] "Singapore"                         "Solomon Islands"                  
## [223] "Sri Lanka"                         "Taiwan"                           
## [225] "Thailand"                          "Timor-Leste (East Timor)"         
## [227] "Tonga"                             "U.S. Pacific Islands"             
## [229] "Vanuatu"                           "Vietnam"                          
## [231] "Wake Island"                       "World"

# Create a vector of "noncountry" values that appear in the region variable
noncountries <- c("North America", "Central & South America", "Antarctica", "Europe", "Eurasia", 
                  "Middle East", "Africa", "Asia & Oceania", "World")

# create a tibble of regional values
coal_region <- coal_long %>% filter(region %in% noncountries)

# create a tibble of country values
coal_country <- coal_long %>% filter(!(region %in% noncountries))

# check them out
unique(coal_region$region)

## [1] "North America"           "Central & South America"
## [3] "Antarctica"              "Europe"                 
## [5] "Eurasia"                 "Middle East"            
## [7] "Africa"                  "Asia & Oceania"         
## [9] "World"

unique(coal_country$region)

##   [1] "Bermuda"                           "Canada"                           
##   [3] "Greenland"                         "Mexico"                           
##   [5] "Saint Pierre and Miquelon"         "United States"                    
##   [7] "Antigua and Barbuda"               "Argentina"                        
##   [9] "Aruba"                             "Bahamas, The"                     
##  [11] "Barbados"                          "Belize"                           
##  [13] "Bolivia"                           "Brazil"                           
##  [15] "Cayman Islands"                    "Chile"                            
##  [17] "Colombia"                          "Costa Rica"                       
##  [19] "Cuba"                              "Dominica"                         
##  [21] "Dominican Republic"                "Ecuador"                          
##  [23] "El Salvador"                       "Falkland Islands (Islas Malvinas)"
##  [25] "French Guiana"                     "Grenada"                          
##  [27] "Guadeloupe"                        "Guatemala"                        
##  [29] "Guyana"                            "Haiti"                            
##  [31] "Honduras"                          "Jamaica"                          
##  [33] "Martinique"                        "Montserrat"                       
##  [35] "Netherlands Antilles"              "Nicaragua"                        
##  [37] "Panama"                            "Paraguay"                         
##  [39] "Peru"                              "Puerto Rico"                      
##  [41] "Saint Kitts and Nevis"             "Saint Lucia"                      
##  [43] "Saint Vincent/Grenadines"          "Suriname"                         
##  [45] "Trinidad and Tobago"               "Turks and Caicos Islands"         
##  [47] "Uruguay"                           "Venezuela"                        
##  [49] "Virgin Islands,  U.S."             "Virgin Islands, British"          
##  [51] "Albania"                           "Austria"                          
##  [53] "Belgium"                           "Bosnia and Herzegovina"           
##  [55] "Bulgaria"                          "Croatia"                          
##  [57] "Cyprus"                            "Czech Republic"                   
##  [59] "Denmark"                           "Faroe Islands"                    
##  [61] "Finland"                           "Former Czechoslovakia"            
##  [63] "Former Serbia and Montenegro"      "Former Yugoslavia"                
##  [65] "France"                            "Germany"                          
##  [67] "Germany, East"                     "Germany, West"                    
##  [69] "Gibraltar"                         "Greece"                           
##  [71] "Hungary"                           "Iceland"                          
##  [73] "Ireland"                           "Italy"                            
##  [75] "Luxembourg"                        "Macedonia"                        
##  [77] "Malta"                             "Montenegro"                       
##  [79] "Netherlands"                       "Norway"                           
##  [81] "Poland"                            "Portugal"                         
##  [83] "Romania"                           "Serbia"                           
##  [85] "Slovakia"                          "Slovenia"                         
##  [87] "Spain"                             "Sweden"                           
##  [89] "Switzerland"                       "Turkey"                           
##  [91] "United Kingdom"                    "Armenia"                          
##  [93] "Azerbaijan"                        "Belarus"                          
##  [95] "Estonia"                           "Former U.S.S.R."                  
##  [97] "Georgia"                           "Kazakhstan"                       
##  [99] "Kyrgyzstan"                        "Latvia"                           
## [101] "Lithuania"                         "Moldova"                          
## [103] "Russia"                            "Tajikistan"                       
## [105] "Turkmenistan"                      "Ukraine"                          
## [107] "Uzbekistan"                        "Bahrain"                          
## [109] "Iran"                              "Iraq"                             
## [111] "Israel"                            "Jordan"                           
## [113] "Kuwait"                            "Lebanon"                          
## [115] "Oman"                              "Palestine"                        
## [117] "Qatar"                             "Saudi Arabia"                     
## [119] "Syria"                             "United Arab Emirates"             
## [121] "Yemen"                             "Algeria"                          
## [123] "Angola"                            "Benin"                            
## [125] "Botswana"                          "Burkina Faso"                     
## [127] "Burundi"                           "Cameroon"                         
## [129] "Cape Verde"                        "Central African Republic"         
## [131] "Chad"                              "Comoros"                          
## [133] "Congo (Brazzaville)"               "Congo (Kinshasa)"                 
## [135] "Cote dIvoire (IvoryCoast)"         "Djibouti"                         
## [137] "Egypt"                             "Equatorial Guinea"                
## [139] "Eritrea"                           "Ethiopia"                         
## [141] "Gabon"                             "Gambia, The"                      
## [143] "Ghana"                             "Guinea"                           
## [145] "Guinea-Bissau"                     "Kenya"                            
## [147] "Lesotho"                           "Liberia"                          
## [149] "Libya"                             "Madagascar"                       
## [151] "Malawi"                            "Mali"                             
## [153] "Mauritania"                        "Mauritius"                        
## [155] "Morocco"                           "Mozambique"                       
## [157] "Namibia"                           "Niger"                            
## [159] "Nigeria"                           "Reunion"                          
## [161] "Rwanda"                            "Saint Helena"                     
## [163] "Sao Tome and Principe"             "Senegal"                          
## [165] "Seychelles"                        "Sierra Leone"                     
## [167] "Somalia"                           "South Africa"                     
## [169] "Sudan"                             "Swaziland"                        
## [171] "Tanzania"                          "Togo"                             
## [173] "Tunisia"                           "Uganda"                           
## [175] "Western Sahara"                    "Zambia"                           
## [177] "Zimbabwe"                          "Afghanistan"                      
## [179] "American Samoa"                    "Australia"                        
## [181] "Bangladesh"                        "Bhutan"                           
## [183] "Brunei"                            "Burma (Myanmar)"                  
## [185] "Cambodia"                          "China"                            
## [187] "Cook Islands"                      "Fiji"                             
## [189] "French Polynesia"                  "Guam"                             
## [191] "Hawaiian Trade Zone"               "Hong Kong"                        
## [193] "India"                             "Indonesia"                        
## [195] "Japan"                             "Kiribati"                         
## [197] "Korea, North"                      "Korea, South"                     
## [199] "Laos"                              "Macau"                            
## [201] "Malaysia"                          "Maldives"                         
## [203] "Mongolia"                          "Nauru"                            
## [205] "Nepal"                             "New Caledonia"                    
## [207] "New Zealand"                       "Niue"                             
## [209] "Pakistan"                          "Papua New Guinea"                 
## [211] "Philippines"                       "Samoa"                            
## [213] "Singapore"                         "Solomon Islands"                  
## [215] "Sri Lanka"                         "Taiwan"                           
## [217] "Thailand"                          "Timor-Leste (East Timor)"         
## [219] "Tonga"                             "U.S. Pacific Islands"             
## [221] "Vanuatu"                           "Vietnam"                          
## [223] "Wake Island"

Data Wrangling in R

Karol Orozco

12/13/2021

Subsetting

CSV files

TSV (Tab-separated Values) files

Delimited Files (txt)

The readxl package

Data Transformation

Making long datasets

Making wide datasets

CASE STUDY: Coal Consumption