# Folder that holds activity.csv on the original author's machine.
data.dir <- "D:/Coursera data Science/Reproducable Research/week 2assignment/repdata%2Fdata%2Factivity"
# Only switch directories when that folder exists; on any other machine the
# script keeps the current working directory instead of aborting, so it still
# runs when activity.csv has been placed there.
if (dir.exists(data.dir)) {
  setwd(data.dir)
} else {
  message("Data folder not found; reading activity.csv from ", getwd())
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the raw activity log; the relative path resolves against the working
# directory.
activity <- read.csv(file = "./activity.csv")
# Column types and a first look at the values.
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# First rows of the data.
head(x = activity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Per-column summary, including the NA count for steps.
summary(object = activity)
## steps date interval
## Min. : 0.00 2012-10-01: 288 Min. : 0.0
## 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
## Median : 0.00 2012-10-03: 288 Median :1177.5
## Mean : 37.38 2012-10-04: 288 Mean :1177.5
## 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
## Max. :806.00 2012-10-06: 288 Max. :2355.0
## NA's :2304 (Other) :15840
# Keep only the rows that have no missing values.
activity.complete <- na.omit(object = activity)
For this part of the assignment, you can ignore the missing values in the dataset.
Calculate the total number of steps taken per day
# Total steps per date, computed from the rows without missing values.
activity.day <- summarise(
  group_by(activity.complete, date),
  steps = sum(steps)
)
summary(object = activity.day)
## date steps
## 2012-10-02: 1 Min. : 41
## 2012-10-03: 1 1st Qu.: 8841
## 2012-10-04: 1 Median :10765
## 2012-10-05: 1 Mean :10766
## 2012-10-06: 1 3rd Qu.:13294
## 2012-10-07: 1 Max. :21194
## (Other) :47
library(ggplot2)
# Histogram of the total steps per day. Built with ggplot() + geom_histogram()
# because qplot() is deprecated in current ggplot2; the default binning is
# left unchanged.
ggplot(activity.day, aes(x = steps)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Calculate and report the mean and median of the total number of steps taken per day
# Mean of the daily totals.
with(activity.day, mean(steps))
## [1] 10766.19
# Median of the daily totals.
with(activity.day, median(steps))
## [1] 10765
Make a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
# Average steps for each 5-minute interval across all days, using only the
# rows without missing values.
activity.interval <- summarise(
  group_by(activity.complete, interval),
  steps = mean(steps)
)
summary(object = activity.interval)
## interval steps
## Min. : 0.0 Min. : 0.000
## 1st Qu.: 588.8 1st Qu.: 2.486
## Median :1177.5 Median : 34.113
## Mean :1177.5 Mean : 37.383
## 3rd Qu.:1766.2 3rd Qu.: 52.835
## Max. :2355.0 Max. :206.170
# Time series of the average steps per interval.
ggplot(data = activity.interval, mapping = aes(x = interval, y = steps)) +
  geom_line()
# Row(s) whose average step count equals the maximum, i.e. the busiest interval.
with(activity.interval, activity.interval[steps == max(steps), ])
## # A tibble: 1 x 2
## interval steps
## <int> <dbl>
## 1 835 206.1698
Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs). The total number of rows with NAs is equal to the difference between the number of rows in the raw data and the number of rows in the data with only complete cases:
# Rows with at least one NA = raw row count minus complete-case row count.
nrow(activity) - nrow(activity.complete)
## [1] 2304
# Imputation: replace each missing step count with the mean of its 5-minute
# interval.
# Rename the per-interval mean column by name (not by position) so it cannot
# clash with the raw `steps` column in the join below.
names(activity.interval)[names(activity.interval) == "steps"] <- "mean.steps"
# Join on the interval explicitly rather than relying on merge() guessing the
# key from whichever column names happen to be shared.
activity.impute <- merge(activity, activity.interval, by = "interval")
# Compute the NA mask once and use it on both sides of the assignment.
missing.steps <- is.na(activity.impute$steps)
activity.impute$steps[missing.steps] <- activity.impute$mean.steps[missing.steps]
The histogram, mean, and median of the total number of steps per day after imputation are as follows:
# Total steps per date after the missing values have been filled in.
activity.day.impute <- summarise(
  group_by(activity.impute, date),
  steps = sum(steps)
)
# Histogram of the imputed daily totals. Built with ggplot() + geom_histogram()
# because qplot() is deprecated in current ggplot2; the default binning is
# left unchanged.
ggplot(activity.day.impute, aes(x = steps)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Mean of the daily totals after imputation.
with(activity.day.impute, mean(steps))
## [1] 10766.19
# Median of the daily totals after imputation.
with(activity.day.impute, median(steps))
## [1] 10766.19
The weekdays() function may be of some help for this part. Use the dataset with the filled-in missing values.
Create a new factor variable in the dataset with two levels - “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Day name for each observation (kept for reference; the text is locale-dependent).
activity.impute$daysofweek <- weekdays(as.Date(activity.impute$date))
# Classify using the numeric day of the week (0 = Sunday, 6 = Saturday)
# instead of the names from weekdays(): those are translated in non-English
# locales and would then never match "Saturday"/"Sunday".
day.number <- as.POSIXlt(as.Date(activity.impute$date))$wday
is.weekend.day <- day.number == 0L | day.number == 6L
# Declare both levels explicitly so the labels stay attached to the right
# value even if the data contains only weekdays or only weekend days.
activity.impute$weekend <- factor(
  is.weekend.day,
  levels = c(FALSE, TRUE),
  labels = c("Weekdays", "Weekend")
)
The panel plot comparing the average number of steps per 5-minute interval on weekdays and weekends is shown below:
# Split the imputed data by day type.
activity.weekday <- activity.impute[activity.impute$weekend == "Weekdays", ]
activity.weekend <- activity.impute[activity.impute$weekend == "Weekend", ]

# Average steps per 5-minute interval on weekdays, tagged for the panel plot.
activity.interval.weekday <- summarise(
  group_by(activity.weekday, interval),
  steps = mean(steps)
)
activity.interval.weekday[["weekend"]] <- "Weekday"

# Average steps per 5-minute interval on weekend days, tagged likewise.
activity.interval.weekend <- summarise(
  group_by(activity.weekend, interval),
  steps = mean(steps)
)
activity.interval.weekend[["weekend"]] <- "Weekend"

# Stack both summaries and draw one line panel per day type.
act.int <- rbind(activity.interval.weekday, activity.interval.weekend)
act.int[["weekend"]] <- factor(act.int[["weekend"]])
ggplot(data = act.int, mapping = aes(x = interval, y = steps)) +
  geom_line() +
  facet_grid(weekend ~ .)