clean - explore - tidy - prepare
# when data cleaning - we have 3 steps:
# NOT remove/replace missing data
# 1 explore ----------------------------
## sturc
### class()
### str()
#### library(dplyr) --> glimpse()
#### glimpse() from dplyr = cleaner alternative to str()
### summary()
## look
### dim()
### names()
### head()
### tail()
### print() - for small dataset
## visualise
### hist(df$variable)
#### e.g. hist((bmi$Y2008))
### plot() - scatterplot for 2 variables
#### e.g. plot(bmi$Y1980,bmi$Y2008)
# 2 tidy ----------------------------
## Each observation forms a row
## each variable forms a column
## each type of observational unit forms a table
## Each value belongs to a variable and an observation
## A dataset is a collection of values
## NOT TURE: A variable contains all values measured
## on the same unit across attributes
## Messy Data
## Variables are stored in both rows and columns
## Column headers are values, not variable names
## A single observational unit is stored in multiple tables
## Multiple variables stored in one column
## NOT MESSY DATA: Multiple values are stored in one column
## data:
## http://wwwf.imperial.ac.uk/medicine/apps/ezzati/metabolic_risks/bmi/
## gather(wide_df, my_key, my_val, -col)
## makes wide datasets long
## useful if: columns that are not variables
## want to: collapse them into key-value pairs
col <- c('X','Y')
A <- c(1,2)
B <- c(3,4)
C <- c(5,6)
wide_df <- data.frame(col, A, B, C)
wide_df # as per example
## col A B C
## 1 X 1 3 5
## 2 Y 2 4 6
library(tidyr)
# gather the my_key columns
long_df <- gather(wide_df, my_key, my_val, -col)
# note the lack of the use of quotes
long_df
## col my_key my_val
## 1 X A 1
## 2 Y A 2
## 3 X B 3
## 4 Y B 4
## 5 X C 5
## 6 Y C 6
# key-values pairs --> spreads them across multiple columns
# e.g. values in a column
# should actually be column names (i.e. variables).
spread(long_df, my_key, my_val)
## col A B C
## 1 X 1 3 5
## 2 Y 2 4 6
patient <- rep(c('X','Y'),3)
treatment <- rep(c('A','B'),3)
year_mo <- c(rep(c('2010-10'),2),
rep(c('2012-08'),2),
rep(c('2014-12'),2)
)
response <- c(1,4,2,5,3,6)
treatments <- data.frame(patient, treatment, year_mo, response)
treatments # as per example
## patient treatment year_mo response
## 1 X A 2010-10 1
## 2 Y B 2010-10 4
## 3 X A 2012-08 2
## 4 Y B 2012-08 5
## 5 X A 2014-12 3
## 6 Y B 2014-12 6
# separate one column into multiple columns
# the sep argument
# any character <> letter or number
# or specify a specific separator
treatments_sep <- separate(treatments,
col = year_mo,
into = c("year", "month"))
# sep = "/"
treatments_sep
## patient treatment year month response
## 1 X A 2010 10 1
## 2 Y B 2010 10 4
## 3 X A 2012 08 2
## 4 Y B 2012 08 5
## 5 X A 2014 12 3
## 6 Y B 2014 12 6
# paste columns together
treatments_uni <- unite(treatments_sep, year_mo, year, month)
treatments_uni
## patient treatment year_mo response
## 1 X A 2010_10 1
## 2 Y B 2010_10 4
## 3 X A 2012_08 2
## 4 Y B 2012_08 5
## 5 X A 2014_12 3
## 6 Y B 2014_12 6
# 3 prepare ----------------------------
# Types of Variables in R
# Make this evaluate to character
# class(true)
# Error: object 'true' not found
class("true")
## [1] "character"
# Make this evaluate to numeric
class("8484.00")
## [1] "character"
class(8484.00)
## [1] "numeric"
# Make this evaluate to integer
class(99)
## [1] "numeric"
class(99L)
## [1] "integer"
# Make this evaluate to factor
class("factor")
## [1] "character"
class(as.factor("factor"))
## [1] "factor"
# Make this evaluate to logical
class("FALSE")
## [1] "character"
class(FALSE)
## [1] "logical"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
ymd("2015-08-25")
## [1] "2015-08-25"
mdy("August 25, 2015")
## [1] "2015-08-25"
dmy("17 Sep 2015")
## [1] "2015-09-17"
mdy_hm("July 15, 2012 12:56")
## [1] "2012-07-15 12:56:00 UTC"
# type conversions = coerce
# as.character()
# as.factor()
library(stringr)
str_trim(" this is a test ")
## [1] "this is a test"
str_pad("24493", width = 7, side = "left", pad = "0")
## [1] "0024493"
# Trim all leading and trailing whitespace
str_trim(c(" Filip ", "Nick ", " Jonathan"))
## [1] "Filip" "Nick" "Jonathan"
# Pad these strings with leading zeros
str_pad(c("23485W", "8823453Q", "994Z"), width = max(length(c("23485W", "8823453Q", "994Z"))), side = "left", pad = "0")
## [1] "23485W" "8823453Q" "994Z"
str_pad(c("23485W", "8823453Q", "994Z"), width = 9, side = "left", pad = "0")
## [1] "00023485W" "08823453Q" "00000994Z"
states <- c("al","ak","az","ar","ca","co","ct","de","fl","ga","hi","id","il","in","ia"
,"ks","ky","la","me","md","ma","mi","mn","ms","mo","mt","ne","nv","nh","nj"
,"nm","ny","nc","nd","oh","ok","or","pa","ri","sc","sd","tn","tx","ut","vt"
,"va","wa","wv","wi","wy")
states_upper <- toupper(states)
tolower(states_upper)
## [1] "al" "ak" "az" "ar" "ca" "co" "ct" "de" "fl" "ga" "hi" "id" "il" "in"
## [15] "ia" "ks" "ky" "la" "me" "md" "ma" "mi" "mn" "ms" "mo" "mt" "ne" "nv"
## [29] "nh" "nj" "nm" "ny" "nc" "nd" "oh" "ok" "or" "pa" "ri" "sc" "sd" "tn"
## [43] "tx" "ut" "vt" "va" "wa" "wv" "wi" "wy"
#str_detect()
#str_detect(students2$dob, "1997")
#str_replace()
#str_replace(students2$sex, "F", "Female")
# 4 missing values ----------------------------
name <- c('status','Tom','David','Alice')
n_friends <- c(244,NA,145,43)
status <- c("Going out!!","","Movie night...","")
social_df <- data.frame(name, n_friends, status)
social_df
## name n_friends status
## 1 status 244 Going out!!
## 2 Tom NA
## 3 David 145 Movie night...
## 4 Alice 43
is.na(social_df)
## name n_friends status
## [1,] FALSE FALSE FALSE
## [2,] FALSE TRUE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
# Are there any NA values in my dataset?
any(is.na(social_df))
## [1] TRUE
summary(social_df)
## name n_friends status
## Alice :1 Min. : 43.0 :2
## David :1 1st Qu.: 94.0 Going out!! :1
## status:1 Median :145.0 Movie night...:1
## Tom :1 Mean :144.0
## 3rd Qu.:194.5
## Max. :244.0
## NA's :1
# odd values?
table(social_df$status)
##
## Going out!! Movie night...
## 2 1 1
# Replace all empty strings in status with NA
social_df$status[social_df$status == ""] <- NA
social_df
## name n_friends status
## 1 status 244 Going out!!
## 2 Tom NA <NA>
## 3 David 145 Movie night...
## 4 Alice 43 <NA>
# which rows have NO missing values?
complete.cases(social_df)
## [1] TRUE FALSE TRUE FALSE
# remove all rows with any missing values
na.omit(social_df)
## name n_friends status
## 1 status 244 Going out!!
## 3 David 145 Movie night...
# outliers?
# - valid / abnormal
# - variability in measurements
# - experimental error
# - data entry error
# to find: look at:
# hist(df,breaks = 20)
# boxplot()
# force values of zero
# to be bucketed
# to the right of zero on the x-axis
# hist(students3$absences, right = FALSE)
# ------------------------------------------------
# Chapter 4
# class() - Class of data object
# dim() - Dimensions of data
# names() - Column names
# str() - Preview of data with helpful details
# glimpse() - Better version of str() from dplyr
# summary() - Summary of data
# # Verify that weather is a data.frame
# class(weather)
#
# # Check the dimensions
# dim(weather)
#
# # View the column names
# names(weather)
#
# # View the structure of the data
# str(weather)
#
# # Load dplyr package
# library(dplyr)
#
# # Look at the structure using dplyr's glimpse()
# glimpse(weather)
#
# # View a summary of the data
# summary(weather)
#
# # View first 6 rows
# head(weather, 6)
#
# # View first 15 rows
# head(weather, 15)
#
# # View the last 6 rows
# tail(weather, 6)
#
# # View the last 10 rows
# tail(weather, 10)
#
# # Load the tidyr package
# library(tidyr)
# # str(weather)
# # 'data.frame': 286 obs. of 35 variables:
# # $ X : int 1 2 3 4 5 6 7 8 9 10 ...
# # $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# # $ month : int 12 12 12 12 12 12 12 12 12 12 ...
# # $ measure: chr "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# # $ X1 : chr "64" "52" "39" "46" ...
# # $ X2 : chr "42" "38" "33" "40" ...
# # $ X3 : chr "51" "44" "37" "49" ...
# # $ X4 : chr "43" "37" "30" "24" ...
#
# # column names X1-X31 represent days of the month,
# # which should really be values of a new variable called day.
# # Gather the columns
# weather2 <- gather(weather, day, value, X1:X31, na.rm = TRUE)
#
# # View the head
# head(weather2)
# # X year month measure day value
# # 1 1 2014 12 Max.TemperatureF X1 64
# # 2 2 2014 12 Mean.TemperatureF X1 52
# # 3 3 2014 12 Min.TemperatureF X1 39
# # 4 4 2014 12 Max.Dew.PointF X1 46
# # 5 5 2014 12 MeanDew.PointF X1 40
# # 6 6 2014 12 Min.DewpointF X1 26
#
# ## The tidyr package is already loaded
# # str(weather2)
# # 'data.frame': 8046 obs. of 6 variables:
# # $ X : int 1 2 3 4 5 6 7 8 9 10 ...
# # $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# # $ month : int 12 12 12 12 12 12 12 12 12 12 ...
# # $ measure: chr "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# # $ day : Factor w/ 31 levels "X1","X2","X3",..: 1 1 1 1 1 1 1 1 1 1 ...
# # $ value : chr "64" "52" "39" "46" ...
#
# # First remove column of row names
# weather2 <- weather2[, -1]
# # 'data.frame': 8046 obs. of 5 variables:
# # $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# # $ month : int 12 12 12 12 12 12 12 12 12 12 ...
# # $ measure: chr "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# # $ day : Factor w/ 31 levels "X1","X2","X3",..: 1 1 1 1 1 1 1 1 1 1 ...
# # $ value : chr "64" "52" "39" "46" ...
#
# # Spread the data
# weather3 <- spread(weather2,measure,value)
#
# # View the head
# head(weather3)
# # 'data.frame': 366 obs. of 25 variables:
# # $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# # $ month : int 12 12 12 12 12 12 12 12 12 12 ...
# # $ day : Factor w/ 31 levels "X1","X2","X3",..: 1 2 3 4 5 6 7 8 9 10 ...
# # $ CloudCover : chr "6" "7" "8" "3" ...
# # $ Events : chr "Rain" "Rain-Snow" "Rain" "" ...
# # $ Max.Dew.PointF : chr "46" "40" "49" "24" ...
# # $ Max.Gust.SpeedMPH : chr "29" "29" "38" "33" ...
# # $ Max.Humidity : chr "74" "92" "100" "69" ...
#
# -- Prepare --
#
# ## tidyr and dplyr are already loaded
#
# # Load the stringr and lubridate packages
# library(stringr)
# library(lubridate)
#
# # Remove X's from day column
# weather3$day <- str_replace(weather3$day,"X","")
#
# # Unite the year, month, and day columns
# weather4 <- unite(weather3, date, year, month, day, sep = "-")
#
# # Convert date column to proper date format using lubridates's ymd()
# weather4$date <- ymd(weather4$date)
#
# # Rearrange columns using dplyr's select()
# weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)
#
# # View the head
# head(weather5)
#
# # View the structure of weather5
# str(weather5)
#
# # Examine the first 20 rows of weather5. Are most of the characters numeric?
# head(weather5,20)
#
# # See what happens if we try to convert PrecipitationIn to numeric
# as.numeric(weather5$PrecipitationIn)
# # Warning message: NAs introduced by coercion
#
# ## The dplyr and stringr packages are already loaded
#
# # Replace T with 0 (T = trace)
# weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn,"T",0)
#
# # Convert characters to numerics
# # Run the call
# # to mutate_each as-is to conveniently
# # apply as.numeric() to all columns
# # from CloudCover through WindDirDegrees
# # (reading left to right in the data),
# weather6 <- mutate_each(weather5, funs(as.numeric), CloudCover:WindDirDegrees)
#
# # Look at result
# str(weather6)
#
# # Count missing values
# sum(is.na(weather6))
# # 6
# # Find missing values
# summary(weather6)
#
# # Find indices of NAs in Max.Gust.SpeedMPH
# ind <- which(is.na(weather6$Max.Gust.SpeedMPH))
# # [1] 169 185 251 275 316 338
#
# # Look at the full rows for records missing Max.Gust.SpeedMPH
# weather6[ind, ]
#
# # # obvious error...
#
# # Review distributions for all variables
# summary(weather6)
#
# # Find row with Max.Humidity of 1000
# ind <- which(weather6$Max.Humidity == 1000)
#
# # Look at the data for that day
# weather6[ind, ]
#
# # Change 1000 to 100
# weather6$Max.Humidity[ind] <- 100
#
# # Look at summary of Mean.VisibilityMiles
# summary(weather6$Mean.VisibilityMiles)
#
# # Get index of row with -1 value
# ind <- which(weather6$Mean.VisibilityMiles == -1)
#
# # Look at full row
# weather6[ind,]
#
# # Set Mean.VisibilityMiles to the appropriate value
# weather6$Mean.VisibilityMiles[ind] <- 10
#
# # Clean up column names
# names(weather6) <- new_colnames
#
# # Replace empty cells in events column
# weather6$events[weather6$events == ""] <- "None"
#
# # Print the first 6 rows of weather6
# head(weather6,6)