# Global chunk options: echo the code and set the chunk `results` option
# to "asis". `opts_chunk` is exported by knitr, so the public `::` operator
# is used instead of the internal-access operator `:::`.
knitr::opts_chunk$set(echo = TRUE, results = "asis")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(xtable)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Point the session at the folder holding the data file and read it in.
setwd("/Users/aleksandr/Documents/R")
data <- read.csv("activity.csv")
# Coerce the "steps" and "interval" columns to numeric in one pass.
data[c("steps", "interval")] <- lapply(data[c("steps", "interval")], as.numeric)
# Keep only the rows that contain no missing values.
cleandata <- data[complete.cases(data), , drop = FALSE]
# List the column names of the raw data set.
colnames(data)
[1] "steps" "date" "interval"
# Render the first six rows of the raw data as an HTML table.
xt <- xtable(head(data, n = 6L))
print(xt, type = "html")
| | steps | date | interval |
|---|---|---|---|
| 1 | | 2012-10-01 | 0.00 |
| 2 | | 2012-10-01 | 5.00 |
| 3 | | 2012-10-01 | 10.00 |
| 4 | | 2012-10-01 | 15.00 |
| 5 | | 2012-10-01 | 20.00 |
| 6 | | 2012-10-01 | 25.00 |
# Show the structure (column classes and leading values) of the raw data.
str(data)
'data.frame': 17568 obs. of 3 variables:
 $ steps   : num NA NA NA NA NA NA NA NA NA NA ...
 $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ interval: num 0 5 10 15 20 25 30 35 40 45 ...
# Render the per-column summary statistics of the raw data as an HTML table.
xt3 <- xtable(
  summary(data)
)
print(xt3, type = "html")
| | steps | date | interval |
|---|---|---|---|
| 1 | Min. : 0.00 | 2012-10-01: 288 | Min. : 0.0 |
| 2 | 1st Qu.: 0.00 | 2012-10-02: 288 | 1st Qu.: 588.8 |
| 3 | Median : 0.00 | 2012-10-03: 288 | Median :1177.5 |
| 4 | Mean : 37.38 | 2012-10-04: 288 | Mean :1177.5 |
| 5 | 3rd Qu.: 12.00 | 2012-10-05: 288 | 3rd Qu.:1766.2 |
| 6 | Max. :806.00 | 2012-10-06: 288 | Max. :2355.0 |
| 7 | NA's :2304 | (Other) :15840 | |
# Aggregate the NA-free observations into one step total per day.
dt <- data.table(cleandata)
totalsteps <- dt[, list(total_steps = sum(steps)), by = date]
# Histogram of the daily totals. The title scale is written with a decimal
# point (1.5): a decimal comma would be parsed as `cex.main = 1` followed by
# a stray positional argument `5` handed to hist().
hist(totalsteps$total_steps,
     breaks = 25,
     main = "Number of steps per day",
     xlab = "Total number of steps", col = "grey",
     cex.main = 1.5)
# Mean of the total number of steps per day
mean(totalsteps[["total_steps"]])
[1] 10766.19
# Median of the total number of steps per day
median(totalsteps[["total_steps"]])
[1] 10765
# Average number of steps for each interval, taken across all days
mintv <- dt[, .(avg_steps = mean(steps)), by = interval]
# Line chart of the average steps against the interval
plot(
  mintv$interval, mintv$avg_steps,
  type = "l",
  main = "Average number of steps by time interval",
  xlab = "5 Minute time interval",
  ylab = "Average number of steps"
)
# Find the interval with the maximum average steps and print it
max_steps <- xtable(
  mintv[which.max(mintv$avg_steps), ]
)
print(max_steps, type = "html")
| | interval | avg_steps |
|---|---|---|
| 1 | 835.00 | 206.17 |
# Count the rows whose step value is missing.
sum(is.na(data[["steps"]]))
[1] 2304
# Rebuild the data.table from the full data set (rows with NA included).
dt <- data.table(data)
# Return `x` unless it is NA, in which case return the fallback `y`.
#   x: a single value that may be NA
#   y: the value to use when `x` is NA
replace_NA <- function(x, y) {
  if (!is.na(x)) {
    return(x)
  }
  y
}
# Create a new column in which every missing step count is replaced by the
# average for the same interval. The averages are aligned to the rows by
# matching on `interval`, so the result does not depend on the shorter
# `mintv$avg_steps` vector being recycled in exactly the right order.
dt$new_steps <- ifelse(
  is.na(dt$steps),
  mintv$avg_steps[match(dt$interval, mintv$interval)],
  dt$steps
)
# Summarize the imputed data set by day
dt_last <- dt[, list(new_steps = sum(new_steps, na.rm = TRUE)),
              by = date]
# Preview the first six rows of the per-day totals as an HTML table
dt_lastT <- xtable(head(dt_last, n = 6L))
print(dt_lastT, type = "html")
| | date | new_steps |
|---|---|---|
| 1 | 2012-10-01 | 10766.19 |
| 2 | 2012-10-02 | 126.00 |
| 3 | 2012-10-03 | 11352.00 |
| 4 | 2012-10-04 | 12116.00 |
| 5 | 2012-10-05 | 13294.00 |
| 6 | 2012-10-06 | 15420.00 |
# Histogram of the daily totals after imputation. The title scale is written
# with a decimal point (1.5): a decimal comma would be parsed as
# `cex.main = 1` followed by a stray positional argument `5` handed to hist().
hist(dt_last$new_steps,
     breaks = 25,
     main = "Number of steps per day",
     xlab = "Total number of steps", col = "grey",
     cex.main = 1.5)
# Mean of the daily totals after imputation
mean(dt_last[["new_steps"]])
[1] 10766.19
# Median of the daily totals after imputation
median(dt_last[["new_steps"]])
[1] 10766.19
# Add the weekday name of each date as a new column.
# NOTE(review): weekdays() returns names in the session locale; the English
# day names matched further down presumably assume an English locale — confirm.
dt_last$dayname <- weekdays(as.Date(dt_last$date))
# Classify English day names as "weekday" or "weekend".
#   day: character vector of day names ("Monday" ... "Sunday"). A single
#        name is classified as a single string; longer vectors are
#        classified element-wise.
# Returns a character vector with one "weekday"/"weekend" entry per element.
# Stops with "invalid date" if any element is not one of the seven day
# names (NA included).
weekday.or.weekend <- function(day) {
  working_days <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
  rest_days <- c("Saturday", "Sunday")
  if (!all(day %in% c(working_days, rest_days))) {
    stop("invalid date")
  }
  # Start from "weekday" everywhere and overwrite the weekend positions.
  day_type <- rep("weekday", length(day))
  day_type[day %in% rest_days] <- "weekend"
  day_type
}
# Add a factor column marking each day as "weekday" or "weekend".
# vapply() iterates over the day names directly (no matrix round-trip),
# checks that every result is a single string, and copes with an empty
# `dayname` column.
dt_last$daytype <- as.factor(
  vapply(dt_last$dayname, weekday.or.weekend, character(1), USE.NAMES = FALSE)
)
# Show the structure of the per-day data set, now including the
# `dayname` and `daytype` columns.
str(dt_last)
Classes 'data.table' and 'data.frame': 61 obs. of 4 variables:
 $ date     : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ new_steps: num 10766 126 11352 12116 13294 ...
 $ dayname  : chr "Monday" "Tuesday" "Wednesday" "Thursday" ...
 $ daytype  : Factor w/ 2 levels "weekday","weekend": 1 1 1 1 1 2 2 1 1 1 ...
 - attr(*, ".internal.selfref")=<externalptr>
# Render the first six rows, including the new day columns, as an HTML table.
dt_lastT <- xtable(head(dt_last, n = 6L))
print(dt_lastT, type = "html")
| | date | new_steps | dayname | daytype |
|---|---|---|---|---|
| 1 | 2012-10-01 | 10766.19 | Monday | weekday |
| 2 | 2012-10-02 | 126.00 | Tuesday | weekday |
| 3 | 2012-10-03 | 11352.00 | Wednesday | weekday |
| 4 | 2012-10-04 | 12116.00 | Thursday | weekday |
| 5 | 2012-10-05 | 13294.00 | Friday | weekday |
| 6 | 2012-10-06 | 15420.00 | Saturday | weekend |