# Special values demo
c(-1, 0, 1) / 0
## [1] -Inf  NaN  Inf

file.choose()

Load & Inspect Bike Data

# change this path if needed to match your file location
bike <- read.csv("/Users/eliasbekdas/Library/CloudStorage/OneDrive-Personal/DATA4310/assignment1/bike_sharing_data.csv")

str(bike)
## 'data.frame':    17379 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
summary(bike)
##    datetime             season         holiday          workingday    
##  Length:17379       Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :3.000   Median :0.00000   Median :1.0000  
##                     Mean   :2.502   Mean   :0.02877   Mean   :0.6827  
##                     3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :4.000   Max.   :1.00000   Max.   :1.0000  
##     weather           temp           atemp         humidity        
##  Min.   :1.000   Min.   : 0.82   Min.   : 0.00   Length:17379      
##  1st Qu.:1.000   1st Qu.:13.94   1st Qu.:16.66   Class :character  
##  Median :1.000   Median :20.50   Median :24.24   Mode  :character  
##  Mean   :1.425   Mean   :20.38   Mean   :23.79                     
##  3rd Qu.:2.000   3rd Qu.:27.06   3rd Qu.:31.06                     
##  Max.   :4.000   Max.   :41.00   Max.   :50.00                     
##    windspeed          casual         registered        count    
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.0   Min.   :  1  
##  1st Qu.: 7.002   1st Qu.:  4.00   1st Qu.: 36.0   1st Qu.: 42  
##  Median :12.998   Median : 16.00   Median :116.0   Median :141  
##  Mean   :12.737   Mean   : 34.48   Mean   :152.5   Mean   :187  
##  3rd Qu.:16.998   3rd Qu.: 46.00   3rd Qu.:217.0   3rd Qu.:277  
##  Max.   :56.997   Max.   :367.00   Max.   :886.0   Max.   :977  
##    sources         
##  Length:17379      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Missing Values

sum(is.na(bike))
## [1] 554
apply(is.na(bike), 2, sum)
##   datetime     season    holiday workingday    weather       temp      atemp 
##          0          0          0          0          0          0          0 
##   humidity  windspeed     casual registered      count    sources 
##          0          0          0          0          0        554
library(stringr)
str_detect(bike, "NA")
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13]  TRUE

Handle Missing Observations

bike_complete <- bike[-which(is.na(bike$sources)),]
str(bike_complete)
## 'data.frame':    16825 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
bike_complete1 <- bike[complete.cases(bike),]
str(bike_complete1)
## 'data.frame':    16825 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
bike_complete2 <- na.omit(bike)
str(bike_complete2)
## 'data.frame':    16825 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
##  - attr(*, "na.action")= 'omit' Named int [1:554] 27 33 36 71 84 115 116 131 133 138 ...
##   ..- attr(*, "names")= chr [1:554] "27" "33" "36" "71" ...

Fix Errors in Humidity

table(bike$humidity)
## 
##   0  10 100  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28 
##  22   1 270   1   1   2   4  10  10  10  16  17  26  27  46  56  59  78  71  97 
##  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48 
## 106 113 118  99 162 133 163 187 224 186 209 224 290 235 270 244 248 316 247 240 
##  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68 
## 327 266 262 312 267 287 352 310 231 258 272 267 335 325 163 219 387 388 161 172 
##  69  70  71  72  73  74  75  76  77  78  79   8  80  81  82  83  84  85  86  87 
## 359 430 193 191 317 341 222 219 336 327 238   1 107 275 299 630 124   5  76 488 
##  88  89  90  91  92  93  94  96  97 x61 
## 657 239   7   1   2 331 560   3   1   1
bad_data <- str_subset(bike$humidity, "[a-z A-Z]")
bad_data
## [1] "x61"
bike[which(bike$humidity=="x61"),"humidity"] <- 61
table(bike$humidity)
## 
##   0  10 100  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28 
##  22   1 270   1   1   2   4  10  10  10  16  17  26  27  46  56  59  78  71  97 
##  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48 
## 106 113 118  99 162 133 163 187 224 186 209 224 290 235 270 244 248 316 247 240 
##  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68 
## 327 266 262 312 267 287 352 310 231 258 272 267 336 325 163 219 387 388 161 172 
##  69  70  71  72  73  74  75  76  77  78  79   8  80  81  82  83  84  85  86  87 
## 359 430 193 191 317 341 222 219 336 327 238   1 107 275 299 630 124   5  76 488 
##  88  89  90  91  92  93  94  96  97 
## 657 239   7   1   2 331 560   3   1
typeof(bike$humidity)
## [1] "character"
bike$humidity <- as.integer(bike$humidity)
typeof(bike$humidity)
## [1] "integer"
hist(bike$humidity)

Recode Factors and Datetime

class(bike$workingday)
## [1] "integer"
bike$workingday <- as.factor(bike$workingday)
class(bike$workingday)
## [1] "factor"
bike$holiday <- factor(bike$holiday, 
                       levels = c(0,1),
                       labels = c("No", "Yes"))
class(bike$holiday)
## [1] "factor"
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
class(bike$datetime)
## [1] "character"
bike$datetime <- mdy_hm(bike$datetime)
class(bike$datetime)
## [1] "POSIXct" "POSIXt"

Normalize Text Data (sources)

table(bike_complete$sources)
## 
##      ad campaign      Ad Campaign      AD campaign             blog 
##             3472              851              894              494 
##           direct    facebook page          Twitter      Twitter     
##             1610             1551              890              855 
##     www.bing.com www.google.co.uk   www.google.com    www.google.fi 
##             1595             1553              527              828 
##    www.yahoo.com 
##             1705
bike_complete$sources <- tolower(bike_complete$sources)
table(bike_complete$sources)
## 
##      ad campaign             blog           direct    facebook page 
##             5217              494             1610             1551 
##          twitter      twitter         www.bing.com www.google.co.uk 
##              890              855             1595             1553 
##   www.google.com    www.google.fi    www.yahoo.com 
##              527              828             1705
bike_complete$sources <- str_trim(bike_complete$sources, "both")
table(bike_complete$sources)
## 
##      ad campaign             blog           direct    facebook page 
##             5217              494             1610             1551 
##          twitter     www.bing.com www.google.co.uk   www.google.com 
##             1745             1595             1553              527 
##    www.google.fi    www.yahoo.com 
##              828             1705
goog_indices <- grep("google", bike_complete$sources)
bike_complete$sources[goog_indices] <- "google"
table(bike_complete$sources)
## 
##   ad campaign          blog        direct facebook page        google 
##          5217           494          1610          1551          2908 
##       twitter  www.bing.com www.yahoo.com 
##          1745          1595          1705

Dirty Iris Dataset

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
str(dirty_iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
##  $ Sepal.Width : num  3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
##  $ Petal.Length: num  4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
##  $ Petal.Width : num  1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
##  $ Species     : chr  "versicolor" "virginica" "virginica" "setosa" ...

Q3: Missing Petal.Length

sum(is.na(dirty_iris$Petal.Length))
## [1] 19

Q4: Complete Rows

sum(complete.cases(dirty_iris))
## [1] 96
sum(complete.cases(dirty_iris)) / nrow(dirty_iris) * 100
## [1] 64

Q5–Q6: Special Values (Inf)

sapply(dirty_iris, function(x) any(is.infinite(x), na.rm = TRUE))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##        FALSE        FALSE        FALSE         TRUE        FALSE
dirty_iris[is.infinite(as.matrix(dirty_iris))] <- NA
sapply(dirty_iris, function(x) any(is.infinite(x), na.rm = TRUE))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##        FALSE        FALSE        FALSE         TRUE        FALSE

Q7: Rule Violations

viol <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
nrow(viol)
## [1] 4
viol
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16           5.0          -3          3.5         1.0 versicolor
## 28          73.0          29         63.0          NA  virginica
## 125         49.0          30         14.0         2.0     setosa
## 130          5.7           0          1.7         0.3     setosa

Q8: Correct Sepal.Width

neg_idx <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0
dirty_iris$Sepal.Width[neg_idx] <- abs(dirty_iris$Sepal.Width[neg_idx])

zero_idx <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width == 0
dirty_iris$Sepal.Width[zero_idx] <- NA

dirty_iris$Sepal.Width[dirty_iris$Sepal.Width == 0] <- NA

summary(dirty_iris$Sepal.Width)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.200   2.800   3.000   3.462   3.300  30.000      18

Q9: Imputations

## Q9: Impute Sepal.Length using linear regression

# Define predictors
predictors <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

# Remove rows with NA, NaN, or Inf in any of these columns
reg_data <- dirty_iris[complete.cases(dirty_iris[, predictors]), ]
reg_data <- reg_data[!apply(reg_data[, predictors], 1, function(x) any(is.infinite(x) | is.nan(x))), ]

# Fit the model
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = reg_data)

# Identify missing Sepal.Length rows in the original dataset
need_pred <- is.na(dirty_iris$Sepal.Length)

# Predict and fill
dirty_iris$Sepal.Length[need_pred] <- predict(lm_model, newdata = dirty_iris[need_pred, ])

Test Answers

# Q1: Technically correct data = B  
# Q2: Consistent data = C  
# Q3: 19 missing = D  
# Q4: 96 rows, 64% = B  
# Q5: Inf = C  
# Q6: True  
# Q7: 4 violations = E  
# Q8: True  
# Q9: A, B, C, D  
# Q10: None of the above = D