clean - explore - tidy - prepare

# when data cleaning - we have 3 steps:
# NOT remove/replace missing data

# 1 explore ----------------------------

## sturc 
###  class()
###  str()
####  library(dplyr) --> glimpse()
####  glimpse() from dplyr = cleaner alternative to str()
###  summary()

## look
###  dim()
###  names()
###  head()
###  tail()
###  print() - for small dataset

## visualise 
###  hist(df$variable)
####  e.g. hist((bmi$Y2008))
### plot() - scatterplot for 2 variables
####  e.g. plot(bmi$Y1980,bmi$Y2008)

# 2 tidy ----------------------------

##  Each observation forms a row
##  each variable forms a column
##  each type of observational unit forms a table
##  Each value belongs to a variable and an observation
##  A dataset is a collection of values
##  NOT TURE: A variable contains all values measured
##            on the same unit across attributes

## Messy Data
##  Variables are stored in both rows and columns
##  Column headers are values, not variable names
##  A single observational unit is stored in multiple tables
##                  Multiple variables stored in one column
##  NOT MESSY DATA: Multiple values are stored in one column

##  data:
##  http://wwwf.imperial.ac.uk/medicine/apps/ezzati/metabolic_risks/bmi/

##  gather(wide_df, my_key, my_val, -col)
##  makes wide datasets long
##  useful if: columns that are not variables 
##  want to: collapse them into key-value pairs

col <- c('X','Y')
A <- c(1,2)
B <- c(3,4)
C <- c(5,6)
wide_df <- data.frame(col, A, B, C)
wide_df # as per example
##   col A B C
## 1   X 1 3 5
## 2   Y 2 4 6

library(tidyr)
# gather the my_key columns
long_df <- gather(wide_df, my_key, my_val, -col)
# note the lack of the use of quotes
long_df
##   col my_key my_val
## 1   X      A      1
## 2   Y      A      2
## 3   X      B      3
## 4   Y      B      4
## 5   X      C      5
## 6   Y      C      6

# key-values pairs --> spreads them across multiple columns
# e.g. values in a column 
#      should actually be column names (i.e. variables).
spread(long_df, my_key, my_val)
##   col A B C
## 1   X 1 3 5
## 2   Y 2 4 6


patient <- rep(c('X','Y'),3)
treatment <- rep(c('A','B'),3)
year_mo <- c(rep(c('2010-10'),2),
             rep(c('2012-08'),2),
             rep(c('2014-12'),2)
             )
response <- c(1,4,2,5,3,6)
treatments <- data.frame(patient, treatment, year_mo, response)
treatments # as per example
##   patient treatment year_mo response
## 1       X         A 2010-10        1
## 2       Y         B 2010-10        4
## 3       X         A 2012-08        2
## 4       Y         B 2012-08        5
## 5       X         A 2014-12        3
## 6       Y         B 2014-12        6

# separate one column into multiple columns
# the sep argument
#   any character <> letter or number
#   or specify a specific separator
treatments_sep <- separate(treatments, 
                           col = year_mo, 
                           into = c("year", "month"))
                           # sep = "/"
treatments_sep
##   patient treatment year month response
## 1       X         A 2010    10        1
## 2       Y         B 2010    10        4
## 3       X         A 2012    08        2
## 4       Y         B 2012    08        5
## 5       X         A 2014    12        3
## 6       Y         B 2014    12        6

# paste columns together
treatments_uni <- unite(treatments_sep, year_mo, year, month)
treatments_uni
##   patient treatment year_mo response
## 1       X         A 2010_10        1
## 2       Y         B 2010_10        4
## 3       X         A 2012_08        2
## 4       Y         B 2012_08        5
## 5       X         A 2014_12        3
## 6       Y         B 2014_12        6


# 3 prepare ----------------------------

# Types of Variables in R

# Make this evaluate to character
  # class(true) 
  # Error: object 'true' not found
class("true")
## [1] "character"

# Make this evaluate to numeric
class("8484.00")
## [1] "character"
class(8484.00)
## [1] "numeric"

# Make this evaluate to integer
class(99)
## [1] "numeric"
class(99L)
## [1] "integer"

# Make this evaluate to factor
class("factor")
## [1] "character"
class(as.factor("factor"))
## [1] "factor"

# Make this evaluate to logical
class("FALSE")
## [1] "character"
class(FALSE)
## [1] "logical"

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
ymd("2015-08-25")
## [1] "2015-08-25"
mdy("August 25, 2015")
## [1] "2015-08-25"
dmy("17 Sep 2015")
## [1] "2015-09-17"
mdy_hm("July 15, 2012 12:56")
## [1] "2012-07-15 12:56:00 UTC"

# type conversions = coerce
# as.character()
# as.factor()

library(stringr)

str_trim("  this is a test     ")
## [1] "this is a test"
str_pad("24493", width = 7, side = "left", pad = "0")
## [1] "0024493"

# Trim all leading and trailing whitespace
str_trim(c("   Filip ", "Nick  ", " Jonathan"))
## [1] "Filip"    "Nick"     "Jonathan"

# Pad these strings with leading zeros
str_pad(c("23485W", "8823453Q", "994Z"), width = max(length(c("23485W", "8823453Q", "994Z"))), side = "left", pad = "0")
## [1] "23485W"   "8823453Q" "994Z"
str_pad(c("23485W", "8823453Q", "994Z"), width = 9, side = "left", pad = "0")
## [1] "00023485W" "08823453Q" "00000994Z"

states <- c("al","ak","az","ar","ca","co","ct","de","fl","ga","hi","id","il","in","ia"
,"ks","ky","la","me","md","ma","mi","mn","ms","mo","mt","ne","nv","nh","nj"
,"nm","ny","nc","nd","oh","ok","or","pa","ri","sc","sd","tn","tx","ut","vt"
,"va","wa","wv","wi","wy")

states_upper <- toupper(states)

tolower(states_upper)
##  [1] "al" "ak" "az" "ar" "ca" "co" "ct" "de" "fl" "ga" "hi" "id" "il" "in"
## [15] "ia" "ks" "ky" "la" "me" "md" "ma" "mi" "mn" "ms" "mo" "mt" "ne" "nv"
## [29] "nh" "nj" "nm" "ny" "nc" "nd" "oh" "ok" "or" "pa" "ri" "sc" "sd" "tn"
## [43] "tx" "ut" "vt" "va" "wa" "wv" "wi" "wy"

#str_detect()
#str_detect(students2$dob, "1997")

#str_replace()
#str_replace(students2$sex, "F", "Female")

# 4 missing values ----------------------------

name <- c('status','Tom','David','Alice')
n_friends <- c(244,NA,145,43)
status <- c("Going out!!","","Movie night...","")
social_df <- data.frame(name, n_friends, status)
social_df
##     name n_friends         status
## 1 status       244    Going out!!
## 2    Tom        NA               
## 3  David       145 Movie night...
## 4  Alice        43

is.na(social_df)
##       name n_friends status
## [1,] FALSE     FALSE  FALSE
## [2,] FALSE      TRUE  FALSE
## [3,] FALSE     FALSE  FALSE
## [4,] FALSE     FALSE  FALSE

# Are there any NA values in my dataset?
any(is.na(social_df))
## [1] TRUE

summary(social_df)
##      name     n_friends                status 
##  Alice :1   Min.   : 43.0                 :2  
##  David :1   1st Qu.: 94.0   Going out!!   :1  
##  status:1   Median :145.0   Movie night...:1  
##  Tom   :1   Mean   :144.0                     
##             3rd Qu.:194.5                     
##             Max.   :244.0                     
##             NA's   :1

# odd values?
table(social_df$status)
## 
##                   Going out!! Movie night... 
##              2              1              1

# Replace all empty strings in status with NA
social_df$status[social_df$status == ""] <- NA

social_df
##     name n_friends         status
## 1 status       244    Going out!!
## 2    Tom        NA           <NA>
## 3  David       145 Movie night...
## 4  Alice        43           <NA>

# which rows have NO missing values?
complete.cases(social_df)
## [1]  TRUE FALSE  TRUE FALSE

# remove all rows with any missing values
na.omit(social_df)
##     name n_friends         status
## 1 status       244    Going out!!
## 3  David       145 Movie night...

# outliers?
# - valid / abnormal
# - variability in measurements
# - experimental error
# - data entry error

# to find: look at: 
# hist(df,breaks = 20)
# boxplot()

# force values of zero 
# to be bucketed 
# to the right of zero on the x-axis
# hist(students3$absences, right = FALSE)





# ------------------------------------------------
  
# Chapter 4 
# class() - Class of data object
# dim() - Dimensions of data
# names() - Column names
# str() - Preview of data with helpful details
# glimpse() - Better version of str() from dplyr
# summary() - Summary of data

# # Verify that weather is a data.frame
# class(weather)
# 
# # Check the dimensions
# dim(weather)
# 
# # View the column names
# names(weather)
# 
# # View the structure of the data
# str(weather)
# 
# # Load dplyr package
# library(dplyr)
# 
# # Look at the structure using dplyr's glimpse()
# glimpse(weather)
# 
# # View a summary of the data
# summary(weather)
# 
# # View first 6 rows
# head(weather, 6)
# 
# # View first 15 rows
# head(weather, 15)
# 
# # View the last 6 rows
# tail(weather, 6)
# 
# # View the last 10 rows
# tail(weather, 10)
# 
# # Load the tidyr package
# library(tidyr)
# # str(weather)
# # 'data.frame':   286 obs. of  35 variables:
# #  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
# #  $ year   : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# #  $ month  : int  12 12 12 12 12 12 12 12 12 12 ...
# #  $ measure: chr  "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# #  $ X1     : chr  "64" "52" "39" "46" ...
# #  $ X2     : chr  "42" "38" "33" "40" ...
# #  $ X3     : chr  "51" "44" "37" "49" ...
# #  $ X4     : chr  "43" "37" "30" "24" ...
# 
# # column names X1-X31 represent days of the month, 
# # which should really be values of a new variable called day.
# # Gather the columns
# weather2 <- gather(weather, day, value, X1:X31, na.rm = TRUE)
# 
# # View the head
# head(weather2)
# #   X year month           measure day value
# # 1 1 2014    12  Max.TemperatureF  X1    64
# # 2 2 2014    12 Mean.TemperatureF  X1    52
# # 3 3 2014    12  Min.TemperatureF  X1    39
# # 4 4 2014    12    Max.Dew.PointF  X1    46
# # 5 5 2014    12    MeanDew.PointF  X1    40
# # 6 6 2014    12     Min.DewpointF  X1    26
# 
# ## The tidyr package is already loaded
# #  str(weather2)
# # 'data.frame':   8046 obs. of  6 variables:
# #  $ X      : int  1 2 3 4 5 6 7 8 9 10 ...
# #  $ year   : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# #  $ month  : int  12 12 12 12 12 12 12 12 12 12 ...
# #  $ measure: chr  "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# #  $ day    : Factor w/ 31 levels "X1","X2","X3",..: 1 1 1 1 1 1 1 1 1 1 ...
# #  $ value  : chr  "64" "52" "39" "46" ...
# 
# # First remove column of row names
# weather2 <- weather2[, -1]
# # 'data.frame':   8046 obs. of  5 variables:
# #  $ year   : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# #  $ month  : int  12 12 12 12 12 12 12 12 12 12 ...
# #  $ measure: chr  "Max.TemperatureF" "Mean.TemperatureF" "Min.TemperatureF" "Max.Dew.PointF" ...
# #  $ day    : Factor w/ 31 levels "X1","X2","X3",..: 1 1 1 1 1 1 1 1 1 1 ...
# #  $ value  : chr  "64" "52" "39" "46" ...
# 
# # Spread the data
# weather3 <- spread(weather2,measure,value)
# 
# # View the head
# head(weather3)
# # 'data.frame':   366 obs. of  25 variables:
# #  $ year                     : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# #  $ month                    : int  12 12 12 12 12 12 12 12 12 12 ...
# #  $ day                      : Factor w/ 31 levels "X1","X2","X3",..: 1 2 3 4 5 6 7 8 9 10 ...
# #  $ CloudCover               : chr  "6" "7" "8" "3" ...
# #  $ Events                   : chr  "Rain" "Rain-Snow" "Rain" "" ...
# #  $ Max.Dew.PointF           : chr  "46" "40" "49" "24" ...
# #  $ Max.Gust.SpeedMPH        : chr  "29" "29" "38" "33" ...
# #  $ Max.Humidity             : chr  "74" "92" "100" "69" ...
# 
# -- Prepare --
#
# ## tidyr and dplyr are already loaded
# 
# # Load the stringr and lubridate packages
# library(stringr)
# library(lubridate)
# 
# # Remove X's from day column
# weather3$day <- str_replace(weather3$day,"X","")
# 
# # Unite the year, month, and day columns
# weather4 <- unite(weather3, date, year, month, day, sep = "-")
# 
# # Convert date column to proper date format using lubridates's ymd()
# weather4$date <- ymd(weather4$date)
# 
# # Rearrange columns using dplyr's select()
# weather5 <- select(weather4, date, Events, CloudCover:WindDirDegrees)
# 
# # View the head
# head(weather5)
# 
# # View the structure of weather5
# str(weather5)
# 
# # Examine the first 20 rows of weather5. Are most of the characters numeric?
# head(weather5,20)
# 
# # See what happens if we try to convert PrecipitationIn to numeric
# as.numeric(weather5$PrecipitationIn)
# # Warning message: NAs introduced by coercion
# 
# ## The dplyr and stringr packages are already loaded
# 
# # Replace T with 0 (T = trace)
# weather5$PrecipitationIn <- str_replace(weather5$PrecipitationIn,"T",0)
# 
# # Convert characters to numerics
# # Run the call 
# # to mutate_each as-is to conveniently 
# # apply as.numeric() to all columns
# # from CloudCover through WindDirDegrees 
# # (reading left to right in the data), 
# weather6 <- mutate_each(weather5, funs(as.numeric), CloudCover:WindDirDegrees)
# 
# # Look at result
# str(weather6)
# 
# # Count missing values
# sum(is.na(weather6))
# # 6
# # Find missing values
# summary(weather6)
# 
# # Find indices of NAs in Max.Gust.SpeedMPH
# ind <- which(is.na(weather6$Max.Gust.SpeedMPH))
# # [1] 169 185 251 275 316 338
# 
# # Look at the full rows for records missing Max.Gust.SpeedMPH
# weather6[ind, ]
# 
# # # obvious error...
#
# # Review distributions for all variables
# summary(weather6)
# 
# # Find row with Max.Humidity of 1000
# ind <- which(weather6$Max.Humidity == 1000)
# 
# # Look at the data for that day
# weather6[ind, ]
# 
# # Change 1000 to 100
# weather6$Max.Humidity[ind] <- 100
# 
# # Look at summary of Mean.VisibilityMiles
# summary(weather6$Mean.VisibilityMiles)
# 
# # Get index of row with -1 value
# ind <- which(weather6$Mean.VisibilityMiles == -1)
# 
# # Look at full row
# weather6[ind,]
# 
# # Set Mean.VisibilityMiles to the appropriate value
# weather6$Mean.VisibilityMiles[ind] <- 10
# 
# # Clean up column names
# names(weather6) <- new_colnames
# 
# # Replace empty cells in events column
# weather6$events[weather6$events == ""] <- "None"
# 
# # Print the first 6 rows of weather6
# head(weather6,6)