# Global chunk defaults: echo the code and emit results as raw ("asis") output.
# opts_chunk is exported by knitr, so the public `::` accessor is sufficient.
knitr::opts_chunk$set(echo = TRUE, results = "asis")

Loading Libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xtable)
library(data.table)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Loading and preprocessing the data

# Read the activity data. The path is relative so the analysis is not tied to
# one user's machine: keep "activity.csv" next to this document (knitr
# evaluates chunks in the document's directory unless configured otherwise).
data <- read.csv("activity.csv")

# changing the class of the variable "steps" to be Numeric:
data$steps <- as.numeric(data$steps)

# changing the class of the variable "interval" to be Numeric:
data$interval <- as.numeric(data$interval)

# removing NAs: keep only the rows with no missing value in any column.
cleandata <- data[complete.cases(data), ]

Some information about the dataset:

# List the column names of the raw dataset.
names(data)

[1] “steps” “date” “interval”

# Render the first six rows of the raw data as an HTML table.
xt <- xtable(head(data, n = 6L))
print(x = xt, type = "html")
steps date interval
1 2012-10-01 0.00
2 2012-10-01 5.00
3 2012-10-01 10.00
4 2012-10-01 15.00
5 2012-10-01 20.00
6 2012-10-01 25.00
# Show the structure of the raw dataset: column types and leading values.
str(data)

‘data.frame’: 17568 obs. of 3 variables: $ steps : num NA NA NA NA NA NA NA NA NA NA … $ date : Factor w/ 61 levels “2012-10-01”,“2012-10-02”,..: 1 1 1 1 1 1 1 1 1 1 … $ interval: num 0 5 10 15 20 25 30 35 40 45 …

# Per-column summary statistics of the raw data, rendered as an HTML table.
xt3 <- xtable(summary(object = data))
print(x = xt3, type = "html")
steps date interval
1 Min. : 0.00 2012-10-01: 288 Min. : 0.0
2 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
3 Median : 0.00 2012-10-03: 288 Median :1177.5
4 Mean : 37.38 2012-10-04: 288 Mean :1177.5
5 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
6 Max. :806.00 2012-10-06: 288 Max. :2355.0
7 NA’s :2304 (Other) :15840

1) What is the mean total number of steps taken per day?

This section addresses the following tasks:

  • Make a histogram of the total number of steps taken each day
  • Calculate and report the mean and median total number of steps taken per day
# Total number of steps per day, computed from the NA-free observations.
dt <- data.table(cleandata)
totalsteps <- dt[, list(total_steps = sum(steps)), by = date]

# Histogram of the daily totals.
hist(totalsteps$total_steps,
     breaks = 25,
     main = "Number of steps per day",
     xlab = "Total number of steps", col = "grey",
     # Decimal point, not comma: `1,5` set cex.main to 1 and passed a stray
     # extra positional argument 5 to hist().
     cex.main = 1.5)

Calculate and report the mean and median total number of steps taken per day

# Mean of the daily step totals (days with missing data excluded).
mean(totalsteps[["total_steps"]])

[1] 10766.19

# Median of the daily step totals (days with missing data excluded).
median(totalsteps[["total_steps"]])

[1] 10765

2) What is the average daily activity pattern?

Time series plot of the 5-minute interval (x-axis) against the average number of steps taken, averaged across all days (y-axis)

# Average number of steps for each 5-minute interval, across all days.
mintv <- dt[, list(avg_steps = mean(steps)), by = interval]
# Line plot of the interval averages.
plot(mintv$interval, mintv$avg_steps,
     type = "l",
     main = "Average number of steps by time interval",
     xlab = "5 Minute time interval",
     ylab = "Average number of steps")

Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

# Find the interval with the maximum average steps and print it
max_steps <- xtable(mintv[which.max(mintv$avg_steps), ])
print(x = max_steps, type = "html")
interval avg_steps
1 835.00 206.17

Imputing missing values

Calculate & Report The Number of Missing Values

# Number of rows whose `steps` value is missing.
sum(is.na(data[["steps"]]))

[1] 2304

Fill in the NA fields and create a new dataset

# Rebuild dt from the full dataset (rows with NA included); before this point
# dt was built from cleandata, i.e. only the complete cases.
dt <- data.table(data)
# Return `x` unless it is NA, in which case return the fallback `y`.
#   x: a single value to test for NA
#   y: the value to use when `x` is NA
replace_NA <- function(x, y) {
  if (!is.na(x)) {
    return(x)
  }
  y
}


#create new column with replaced NAs by average values
# Look up each row's interval average explicitly. Passing mintv$avg_steps
# directly relied on silent recycling, which is only correct when the rows are
# sorted by date then interval and every day contains every interval.
interval_avg <- mintv$avg_steps[match(dt$interval, mintv$interval)]
dt$new_steps <- mapply(replace_NA, dt$steps, interval_avg)

# Summarise the imputed dataset by day: total steps per date.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
dt_last <- dt[, list(new_steps = sum(new_steps, na.rm = TRUE)),
              by = date]

# Preview the first rows of the daily totals as an HTML table.
dt_lastT <- xtable(head(dt_last))
print(dt_lastT, type = "html")
date new_steps
1 2012-10-01 10766.19
2 2012-10-02 126.00
3 2012-10-03 11352.00
4 2012-10-04 12116.00
5 2012-10-05 13294.00
6 2012-10-06 15420.00

Make a histogram of the total number of steps taken each day

# Histogram of the total steps per day after imputing missing values.
hist(dt_last$new_steps,
     breaks = 25,
     main = "Number of steps per day",
     xlab = "Total number of steps", col = "grey",
     # Decimal point, not comma: `1,5` set cex.main to 1 and passed a stray
     # extra positional argument 5 to hist().
     cex.main = 1.5)

Mean and median total number of steps taken per day

# Mean of the daily totals after imputation.
mean(dt_last[["new_steps"]])

[1] 10766.19

# Median of the daily totals after imputation.
median(dt_last[["new_steps"]])

[1] 10766.19

Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.

#Add name of week
# Index a fixed English vector by POSIXlt's `wday` (0 = Sunday) instead of
# calling weekdays(): weekdays() returns locale-dependent names, which would
# not match the English names this analysis compares against.
day_names <- c("Sunday", "Monday", "Tuesday", "Wednesday",
               "Thursday", "Friday", "Saturday")
dt_last$dayname <- day_names[as.POSIXlt(as.Date(dt_last$date))$wday + 1L]

# Classify English day names as "weekday" or "weekend".
#   day: character vector of day names ("Monday" ... "Sunday"); a single name
#        works exactly as before, longer vectors are classified element-wise.
# Returns a character vector with one "weekday"/"weekend" label per element.
# Stops with "invalid date" if any element is not a recognised day name.
weekday.or.weekend <- function(day) {
  weekday_names <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
  weekend_names <- c("Saturday", "Sunday")

  is_weekday <- day %in% weekday_names
  if (!all(is_weekday | day %in% weekend_names)) {
    stop("invalid date")
  }

  # Everything that passed validation and is not a weekday is a weekend day.
  labels <- rep("weekend", length(day))
  labels[is_weekday] <- "weekday"
  labels
}

# Add a factor column labelling each day as "weekday" or "weekend".
# vapply() iterates over the names directly and checks that each call returns
# exactly one string, replacing the apply()-over-a-one-column-matrix workaround.
dt_last$daytype <- as.factor(
  vapply(dt_last$dayname, weekday.or.weekend, character(1), USE.NAMES = FALSE)
)

# Preview the structure of the per-day table, now including the dayname and
# daytype columns.
str(dt_last)

Classes ‘data.table’ and ‘data.frame’: 61 obs. of 4 variables: $ date : Factor w/ 61 levels “2012-10-01”,“2012-10-02”,..: 1 2 3 4 5 6 7 8 9 10 … $ new_steps: num 10766 126 11352 12116 13294 … $ dayname : chr “Monday” “Tuesday” “Wednesday” “Thursday” … $ daytype : Factor w/ 2 levels “weekday”,“weekend”: 1 1 1 1 1 2 2 1 1 1 … - attr(*, “.internal.selfref”)=

# Render the first six rows of the per-day table as an HTML table.
dt_lastT <- xtable(head(dt_last, n = 6L))
print(x = dt_lastT, type = "html")
date new_steps dayname daytype
1 2012-10-01 10766.19 Monday weekday
2 2012-10-02 126.00 Tuesday weekday
3 2012-10-03 11352.00 Wednesday weekday
4 2012-10-04 12116.00 Thursday weekday
5 2012-10-05 13294.00 Friday weekday
6 2012-10-06 15420.00 Saturday weekend

Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).