1. Loading and preprocessing the data

Read a file in table format

# Load the raw activity log. Column classes are declared up front so the
# second column is parsed as Date and the other two as integer.
activity <- read.csv(
  file = "activity.csv",
  header = TRUE,
  colClasses = c("integer", "Date", "integer")
)

Display the structure of the dataset

# Compact overview of the data frame: dimensions, column names and types.
utils::str(object = activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

Show the first rows of the dataset

# Print the first six rows (the default for head()).
utils::head(x = activity, n = 6L)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

2. What is mean total number of steps taken per day?

Histogram of the total number of steps taken each day

# Histogram of the total number of steps per day.
# Rows with a missing step count are dropped with `!is.na(steps)`.
# Note: `steps != is.na(steps)` is not an NA test -- it compares steps with
# TRUE/FALSE and therefore also discards every row where steps == 0.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(sumSteps = sum(steps, na.rm = TRUE)) %>%
  ggplot(aes(sumSteps)) +
  geom_histogram(bins = 20, color = "black", fill = "light blue") +
  xlab(label = "") +
  theme_light()

Mean and median total number of steps taken per day

# Two-stage summary: the inner call totals steps per date, the outer call
# collapses those daily totals into one mean and one median.
# Because of na.rm = TRUE, a date whose readings are all NA totals 0 and is
# still included in both statistics.
summarise(
  summarise(group_by(activity, date), sumSteps = sum(steps, na.rm = TRUE)),
  stepsByDayMean = mean(sumSteps, na.rm = TRUE),
  stepsByDayMedian = median(sumSteps, na.rm = TRUE)
)
## # A tibble: 1 x 2
##   stepsByDayMean stepsByDayMedian
##            <dbl>            <int>
## 1        9354.23            10395

3. What is the average daily activity pattern?

Time series plot of the average number of steps taken (y-axis) per 5-minute interval (x-axis)

# Line plot of the mean number of steps in each 5-minute interval, averaged
# over all days.
# Only missing readings are removed. Zero-step readings must stay in, because
# dropping them (as `steps != is.na(steps)` does) inflates every interval mean.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarise(MeanSteps = mean(steps, na.rm = TRUE)) %>%
  arrange(interval) %>%
  ggplot(aes(y = MeanSteps, x = interval)) +
  geom_line() +
  xlab("5-minute interval") +
  ylab("Average number of steps") +
  theme_light()

Maximum of the average number of steps across all 5-minute intervals

# Inner call: mean steps per interval (NAs ignored).
# Outer call: the largest of those interval means.
summarise(
  summarise(group_by(activity, interval), MeanSteps = mean(steps, na.rm = TRUE)),
  MaxAverageNumberOfSteps = max(MeanSteps)
)
## # A tibble: 1 x 1
##   MaxAverageNumberOfSteps
##                     <dbl>
## 1                206.1698

4. Imputing missing values

Calculate and report the total number of missing values in the dataset

# Number of NA entries in each column, returned as a named integer vector.
vapply(activity, function(column) sum(is.na(column)), integer(1))
##    steps     date interval 
##     2304        0        0

Impute missing data: replace each NA in steps with the overall mean of the non-missing step counts.

# Impute on a copy so the raw data stays available for comparison.
activity.imp <- activity
# Replace missing step counts with the overall mean of the observed ones.
# The NA mask is taken from the steps column itself: is.na() on the whole
# data frame yields a rows x columns matrix, and using that as a row index is
# only correct while every NA happens to sit in the first column.
activity.imp$steps[is.na(activity.imp$steps)] <-
  mean(activity.imp$steps, na.rm = TRUE)

Histogram of the total number of steps taken each day before and after missing values are imputed

# Tag each data set so the two can be told apart after stacking them.
activity$impute <- "before"
activity.imp$impute <- "after"
# Stack raw and imputed data, total the steps per day within each tag, and
# draw both distributions in one histogram.
# Missing readings are removed with `!is.na(steps)`; `steps != is.na(steps)`
# would additionally throw away every zero-step reading.
union_all(x = activity, y = activity.imp) %>%
  filter(!is.na(steps)) %>%
  group_by(date, impute) %>%
  summarise(sumSteps = sum(steps, na.rm = TRUE)) %>%
  ggplot(aes(sumSteps, fill = impute)) +
  geom_histogram(bins = 20) +
  xlab(label = "") +
  theme_light()

Mean and median total number of steps taken each day before and after imputation

# Read from the inside out: stack raw and imputed rows, total the steps per
# (date, impute) pair, then regroup by impute alone to get one mean and one
# median of the daily totals per data set.
summarise(
  group_by(
    summarise(
      group_by(union_all(x = activity, y = activity.imp), date, impute),
      sumSteps = sum(steps, na.rm = TRUE)
    ),
    impute
  ),
  stepByDayMean = mean(sumSteps),
  stepByDayMedian = median(sumSteps)
)
## # A tibble: 2 x 3
##   impute stepByDayMean stepByDayMedian
##    <chr>         <dbl>           <dbl>
## 1  after      10766.19        10766.19
## 2 before       9354.23        10395.00

5. Are there differences in activity patterns between weekdays and weekends ?

Create new factor variables in the dataset

The variable weekend has two levels – “weekday” and “weekend” – indicating whether a given date is a weekday or a weekend day.

# Abbreviated day name per row; these labels depend on the session locale.
activity$weekday <- weekdays(activity$date, abbreviate = TRUE)
activity$weekday <- as.factor(x = activity$weekday)
# Classify weekend days from the numeric day of week (0 = Sunday, 6 = Saturday)
# instead of matching localized labels such as "sob." / "niedz.", which only
# exist under a Polish locale and would mark every row "weekday" elsewhere.
activity$weekend <- ifelse(
  test = as.POSIXlt(activity$date)$wday %in% c(0L, 6L),
  yes = "weekend",
  no = "weekday"
)
activity$weekend <- factor(activity$weekend, levels = c("weekday", "weekend"))

Time series plot of the average number of steps taken (y-axis) per 5-minute interval (x-axis), shown separately for weekdays and weekends

# Mean steps per 5-minute interval, computed separately for weekdays and
# weekends and drawn as one panel per group.
# Only missing readings are removed. Zero-step readings are kept, since
# dropping them (as `steps != is.na(steps)` does) inflates the averages.
activity %>%
  filter(!is.na(steps)) %>%
  group_by(interval, weekend) %>%
  summarise(MeanSteps = mean(steps, na.rm = TRUE)) %>%
  ggplot(aes(y = MeanSteps, x = interval, col = weekend)) +
  geom_line() +
  xlab("5-minute interval") +
  ylab("Average number of steps") +
  facet_grid(weekend ~ .) +
  theme_light()