Repdata

Load data and libraries

# Read the raw activity measurements (columns: steps, date, interval) from
# the working directory.
activity <- read.csv(file = "activity.csv")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

total number of steps taken per day

histogram of total number of steps per day

# List the distinct dates present in the data set.
unique(activity[["date"]])

##  [1] "2012-10-01" "2012-10-02" "2012-10-03" "2012-10-04" "2012-10-05"
##  [6] "2012-10-06" "2012-10-07" "2012-10-08" "2012-10-09" "2012-10-10"
## [11] "2012-10-11" "2012-10-12" "2012-10-13" "2012-10-14" "2012-10-15"
## [16] "2012-10-16" "2012-10-17" "2012-10-18" "2012-10-19" "2012-10-20"
## [21] "2012-10-21" "2012-10-22" "2012-10-23" "2012-10-24" "2012-10-25"
## [26] "2012-10-26" "2012-10-27" "2012-10-28" "2012-10-29" "2012-10-30"
## [31] "2012-10-31" "2012-11-01" "2012-11-02" "2012-11-03" "2012-11-04"
## [36] "2012-11-05" "2012-11-06" "2012-11-07" "2012-11-08" "2012-11-09"
## [41] "2012-11-10" "2012-11-11" "2012-11-12" "2012-11-13" "2012-11-14"
## [46] "2012-11-15" "2012-11-16" "2012-11-17" "2012-11-18" "2012-11-19"
## [51] "2012-11-20" "2012-11-21" "2012-11-22" "2012-11-23" "2012-11-24"
## [56] "2012-11-25" "2012-11-26" "2012-11-27" "2012-11-28" "2012-11-29"
## [61] "2012-11-30"

# Total steps per date. Because NAs are dropped inside sum(), a date whose
# readings are all missing gets a total of 0 rather than NA.
totalsteps <- activity %>%
  group_by(date) %>%
  summarise(total = sum(steps, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

# Show the per-day totals.
print(totalsteps)

## # A tibble: 61 x 2
##    date       total
##    <chr>      <int>
##  1 2012-10-01     0
##  2 2012-10-02   126
##  3 2012-10-03 11352
##  4 2012-10-04 12116
##  5 2012-10-05 13294
##  6 2012-10-06 15420
##  7 2012-10-07 11015
##  8 2012-10-08     0
##  9 2012-10-09 12811
## 10 2012-10-10  9900
## # ... with 51 more rows

# Histogram of the daily step totals, 2500 steps per bin.
# The fill is mapped to each bin's position and coloured along the blues9
# palette. Passing `fill = blues9` as a fixed parameter only works while the
# histogram has exactly nine bins (one colour per bin); any other bin count
# makes ggplot2 stop with an aesthetic-length error.
ggplot(totalsteps, aes(total)) +
  geom_histogram(aes(fill = after_stat(x)), binwidth = 2500) +
  scale_fill_gradientn(colours = blues9, guide = "none") +
  xlab("Total number of steps per day") +
  ylab("count of total number of steps per day") +
  ggtitle("Histogram of steps per day")

Mean and median of the total number of steps taken per day

# Mean of the daily totals.
mean(totalsteps[["total"]], na.rm = TRUE)

## [1] 9354.23

# Median of the daily totals.
median(totalsteps[["total"]], na.rm = TRUE)

## [1] 10395

Average daily pattern

# Average number of steps for each 5-minute interval across all days,
# ignoring missing readings.
avesteps <- activity %>%
  group_by(interval) %>%
  summarise(average = mean(steps, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

# Plot the average number of steps per 5-minute interval.
# The interval identifiers are deliberately left untouched: rescaling them in
# place (previously `* 100`) mislabels the x axis and breaks any later lookup
# that matches activity$interval against avesteps$interval.
# NOTE(review): re-knit the report; the printed peak below was produced with
# the rescaled identifiers.
ggplot(avesteps, aes(x = interval, y = average)) +
  geom_line(color = "steelblue", size = 1, alpha = 0.8)

# Interval with the highest average step count: identifier 835
# (presumably 08:35 - confirm the interval encoding).
avesteps[which.max(avesteps$average), ]

## # A tibble: 1 x 2
##   interval average
##      <dbl>   <dbl>
## 1    83500    206.

Imputing missing values

number of NA values in activity dataset

# Count the missing values in every column of the activity data.
vapply(activity, function(column) sum(is.na(column)), numeric(1))

##    steps     date interval 
##     2304        0        0

replace missing values

NA values will be replaced by the average number of steps for the corresponding 5-minute interval. A new column, Complete_steps, is created with all NA values replaced.

# Impute missing step counts with the (rounded) mean of the same 5-minute
# interval across all days.
# The per-interval means are computed directly from `activity` instead of
# being looked up in `avesteps`: that lookup silently returned NA for every
# interval whose identifier in `avesteps` no longer matched the raw one, so
# most missing values were never filled.
# NOTE(review): re-knit the report; the printed output and the conclusion about
# the impact on mean/median further down were produced with the failed lookup.
interval_means <- ave(
  as.numeric(activity$steps),
  activity$interval,
  FUN = function(x) mean(x, na.rm = TRUE)
)
activity$Complete_steps <- ifelse(
  is.na(activity$steps),
  round(interval_means, 0),
  activity$steps
)

# Build the completed data set: same layout as the raw data, with the imputed
# values in the steps column.
Finalactivity <- data.frame(
  steps = activity$Complete_steps,
  interval = activity$interval,
  date = activity$date
)
head(Finalactivity, n = 5) # first 5 rows of the new activity data

##   steps interval       date
## 1     2        0 2012-10-01
## 2    NA        5 2012-10-01
## 3    NA       10 2012-10-01
## 4    NA       15 2012-10-01
## 5    NA       20 2012-10-01

Histogram of total number of steps

# Total steps per date, computed from the completed data set.
Finalactivity_total <- Finalactivity %>%
  group_by(date) %>%
  summarise(total = sum(steps, na.rm = TRUE))

## `summarise()` ungrouping output (override with `.groups` argument)

# Histogram of the daily step totals after imputation, 2500 steps per bin.
# The fill is mapped to each bin's position and coloured along the blues9
# palette. Passing `fill = blues9` as a fixed parameter only works while the
# histogram has exactly nine bins; any other bin count makes ggplot2 stop with
# an aesthetic-length error. Axis labels mirror the first histogram.
ggplot(Finalactivity_total, aes(total)) +
  geom_histogram(aes(fill = after_stat(x)), binwidth = 2500) +
  scale_fill_gradientn(colours = blues9, guide = "none") +
  xlab("Total number of steps per day") +
  ylab("count of total number of steps per day") +
  ggtitle("Histogram of steps per day (missing values imputed)")

mean and median total number of steps per day

# Mean of the daily totals in the completed data set.
mean(Finalactivity_total[["total"]])

## [1] 9354.492

# Median of the daily totals in the completed data set.
median(Finalactivity_total[["total"]])

## [1] 10395

# there is not much impact on the mean and median.

Differences in activity patterns between weekdays and weekends

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Convert the date column to Date and derive the weekday name of each
# observation. weekdays() returns names in the current locale.
activity[["date"]] <- as.Date(activity[["date"]])
activity[["day"]] <- weekdays(activity[["date"]])
unique(activity[["day"]])

## [1] "Monday"    "Tuesday"   "Wednesday" "Thursday"  "Friday"    "Saturday" 
## [7] "Sunday"

# Label every observation as "Weekend" or "Weekday".
# The classification uses the numeric day of the week (POSIXlt `wday`:
# 0 = Sunday, 6 = Saturday) rather than comparing weekday names with English
# strings, which would label every row "Weekday" in a non-English locale.
weekday_index <- as.POSIXlt(activity$date)$wday
activity$daytype <- ifelse(weekday_index %in% c(0, 6), "Weekend", "Weekday")
unique(activity$daytype)

## [1] "Weekday" "Weekend"

# Average steps per 5-minute interval, separately for weekdays and weekends.
# This averages the raw `steps` column, dropping missing readings.
activity_grouped <- activity %>%
  group_by(daytype, interval) %>%
  summarise(ave = mean(steps, na.rm = TRUE))

## `summarise()` regrouping output by 'daytype' (override with `.groups` argument)

# Line plot of the average steps per interval, one panel per day type
# (panels stacked vertically).
ggplot(activity_grouped, aes(x = interval, y = ave)) +
  geom_line(color = "steelblue") +
  facet_grid(daytype ~ .) +
  labs(y = "average number of steps")

## End