# Special values demo
c(-1, 0, 1) / 0
## [1] -Inf NaN Inf
file.choose()
# change this path if needed to match your file location
bike <- read.csv("/Users/eliasbekdas/Library/CloudStorage/OneDrive-Personal/DATA4310/assignment1/bike_sharing_data.csv")
str(bike)
## 'data.frame': 17379 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
summary(bike)
## datetime season holiday workingday
## Length:17379 Min. :1.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :3.000 Median :0.00000 Median :1.0000
## Mean :2.502 Mean :0.02877 Mean :0.6827
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.00000 Max. :1.0000
## weather temp atemp humidity
## Min. :1.000 Min. : 0.82 Min. : 0.00 Length:17379
## 1st Qu.:1.000 1st Qu.:13.94 1st Qu.:16.66 Class :character
## Median :1.000 Median :20.50 Median :24.24 Mode :character
## Mean :1.425 Mean :20.38 Mean :23.79
## 3rd Qu.:2.000 3rd Qu.:27.06 3rd Qu.:31.06
## Max. :4.000 Max. :41.00 Max. :50.00
## windspeed casual registered count
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 1
## 1st Qu.: 7.002 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42
## Median :12.998 Median : 16.00 Median :116.0 Median :141
## Mean :12.737 Mean : 34.48 Mean :152.5 Mean :187
## 3rd Qu.:16.998 3rd Qu.: 46.00 3rd Qu.:217.0 3rd Qu.:277
## Max. :56.997 Max. :367.00 Max. :886.0 Max. :977
## sources
## Length:17379
## Class :character
## Mode :character
##
##
##
sum(is.na(bike))
## [1] 554
apply(is.na(bike), 2, sum)
## datetime season holiday workingday weather temp atemp
## 0 0 0 0 0 0 0
## humidity windspeed casual registered count sources
## 0 0 0 0 0 554
library(stringr)
str_detect(bike, "NA")
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] TRUE
bike_complete <- bike[-which(is.na(bike$sources)),]
str(bike_complete)
## 'data.frame': 16825 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
bike_complete1 <- bike[complete.cases(bike),]
str(bike_complete1)
## 'data.frame': 16825 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
bike_complete2 <- na.omit(bike)
str(bike_complete2)
## 'data.frame': 16825 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
## - attr(*, "na.action")= 'omit' Named int [1:554] 27 33 36 71 84 115 116 131 133 138 ...
## ..- attr(*, "names")= chr [1:554] "27" "33" "36" "71" ...
table(bike$humidity)
##
## 0 10 100 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 22 1 270 1 1 2 4 10 10 10 16 17 26 27 46 56 59 78 71 97
## 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 106 113 118 99 162 133 163 187 224 186 209 224 290 235 270 244 248 316 247 240
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## 327 266 262 312 267 287 352 310 231 258 272 267 335 325 163 219 387 388 161 172
## 69 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87
## 359 430 193 191 317 341 222 219 336 327 238 1 107 275 299 630 124 5 76 488
## 88 89 90 91 92 93 94 96 97 x61
## 657 239 7 1 2 331 560 3 1 1
bad_data <- str_subset(bike$humidity, "[a-z A-Z]")
bad_data
## [1] "x61"
bike[which(bike$humidity=="x61"),"humidity"] <- 61
table(bike$humidity)
##
## 0 10 100 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 22 1 270 1 1 2 4 10 10 10 16 17 26 27 46 56 59 78 71 97
## 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 106 113 118 99 162 133 163 187 224 186 209 224 290 235 270 244 248 316 247 240
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## 327 266 262 312 267 287 352 310 231 258 272 267 336 325 163 219 387 388 161 172
## 69 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87
## 359 430 193 191 317 341 222 219 336 327 238 1 107 275 299 630 124 5 76 488
## 88 89 90 91 92 93 94 96 97
## 657 239 7 1 2 331 560 3 1
typeof(bike$humidity)
## [1] "character"
bike$humidity <- as.integer(bike$humidity)
typeof(bike$humidity)
## [1] "integer"
hist(bike$humidity)
class(bike$workingday)
## [1] "integer"
bike$workingday <- as.factor(bike$workingday)
class(bike$workingday)
## [1] "factor"
bike$holiday <- factor(bike$holiday,
levels = c(0,1),
labels = c("No", "Yes"))
class(bike$holiday)
## [1] "factor"
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
class(bike$datetime)
## [1] "character"
bike$datetime <- mdy_hm(bike$datetime)
class(bike$datetime)
## [1] "POSIXct" "POSIXt"
table(bike_complete$sources)
##
## ad campaign Ad Campaign AD campaign blog
## 3472 851 894 494
## direct facebook page Twitter Twitter
## 1610 1551 890 855
## www.bing.com www.google.co.uk www.google.com www.google.fi
## 1595 1553 527 828
## www.yahoo.com
## 1705
bike_complete$sources <- tolower(bike_complete$sources)
table(bike_complete$sources)
##
## ad campaign blog direct facebook page
## 5217 494 1610 1551
## twitter twitter www.bing.com www.google.co.uk
## 890 855 1595 1553
## www.google.com www.google.fi www.yahoo.com
## 527 828 1705
bike_complete$sources <- str_trim(bike_complete$sources, "both")
table(bike_complete$sources)
##
## ad campaign blog direct facebook page
## 5217 494 1610 1551
## twitter www.bing.com www.google.co.uk www.google.com
## 1745 1595 1553 527
## www.google.fi www.yahoo.com
## 828 1705
goog_indices <- grep("google", bike_complete$sources)
bike_complete$sources[goog_indices] <- "google"
table(bike_complete$sources)
##
## ad campaign blog direct facebook page google
## 5217 494 1610 1551 2908
## twitter www.bing.com www.yahoo.com
## 1745 1595 1705
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
str(dirty_iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 6.4 6.3 6.2 5 5.7 5.3 6.4 5.9 5.8 4.8 ...
## $ Sepal.Width : num 3.2 3.3 NA 3.4 2.6 NA 2.7 3 2.7 3.1 ...
## $ Petal.Length: num 4.5 6 5.4 1.6 3.5 NA 5.3 5.1 4.1 1.6 ...
## $ Petal.Width : num 1.5 2.5 2.3 0.4 1 0.2 NA 1.8 1 0.2 ...
## $ Species : chr "versicolor" "virginica" "virginica" "setosa" ...
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
sum(complete.cases(dirty_iris))
## [1] 96
sum(complete.cases(dirty_iris)) / nrow(dirty_iris) * 100
## [1] 64
sapply(dirty_iris, function(x) any(is.infinite(x), na.rm = TRUE))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## FALSE FALSE FALSE TRUE FALSE
dirty_iris[is.infinite(as.matrix(dirty_iris))] <- NA
sapply(dirty_iris, function(x) any(is.infinite(x), na.rm = TRUE))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## FALSE FALSE FALSE TRUE FALSE
viol <- subset(dirty_iris, Sepal.Width <= 0 | Sepal.Length > 30)
nrow(viol)
## [1] 4
viol
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5.0 -3 3.5 1.0 versicolor
## 28 73.0 29 63.0 NA virginica
## 125 49.0 30 14.0 2.0 setosa
## 130 5.7 0 1.7 0.3 setosa
neg_idx <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width < 0
dirty_iris$Sepal.Width[neg_idx] <- abs(dirty_iris$Sepal.Width[neg_idx])
zero_idx <- !is.na(dirty_iris$Sepal.Width) & dirty_iris$Sepal.Width == 0
dirty_iris$Sepal.Width[zero_idx] <- NA
dirty_iris$Sepal.Width[dirty_iris$Sepal.Width == 0] <- NA
summary(dirty_iris$Sepal.Width)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.200 2.800 3.000 3.462 3.300 30.000 18
## Q9: Impute Sepal.Length using linear regression
# Define predictors
predictors <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
# Remove rows with NA, NaN, or Inf in any of these columns
reg_data <- dirty_iris[complete.cases(dirty_iris[, predictors]), ]
reg_data <- reg_data[!apply(reg_data[, predictors], 1, function(x) any(is.infinite(x) | is.nan(x))), ]
# Fit the model
lm_model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = reg_data)
# Identify missing Sepal.Length rows in the original dataset
need_pred <- is.na(dirty_iris$Sepal.Length)
# Predict and fill
dirty_iris$Sepal.Length[need_pred] <- predict(lm_model, newdata = dirty_iris[need_pred, ])
# Q1: Technically correct data = B
# Q2: Consistent data = C
# Q3: 19 missing = D
# Q4: 96 rows, 64% = B
# Q5: Inf = C
# Q6: True
# Q7: 4 violations = E
# Q8: True
# Q9: A, B, C, D
# Q10: None of the above = D