# Write every figure produced by the chunks below into the Figs/ directory.
knitr::opts_chunk$set(fig.path = "Figs/")
library(ggplot2)
library(dplyr)
library(mice)
library(lattice)

Loading and preprocessing the data

# Read the raw activity log from the working directory.
activity <- read.csv("activity.csv")
# Structure of the raw data:
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
# First rows of the raw data:
head(activity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
# Total steps per day. The formula method of aggregate() omits rows with
# missing steps by default, which is why 2012-10-01 is absent below.
day <- aggregate(steps ~ date, data = activity, FUN = sum)
head(day)
##         date steps
## 1 2012-10-02   126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015

What is mean total number of steps taken per day?

# Mean and median of the daily step totals.
mea <- mean(day$steps)
med <- median(day$steps)

rbind(mea, med)
##         [,1]
## mea 10766.19
## med 10765.00
# Histogram of the daily step totals.
p1 <- ggplot(data = day, aes(x = steps)) +
  geom_histogram(bins = 30, fill = "plum2") +
  ggtitle("Steps taken per day")

# Dashed vertical line at the mean; reuses `mea` instead of recomputing it.
p2 <- p1 +
  geom_vline(
    xintercept = mea, size = 1,
    colour = "blue", alpha = 0.5, linetype = "dashed"
  )

# Rotated text label positioned at the mean.
p3 <- p2 +
  annotate(geom = "text", x = mea, y = 7.5, label = "mean", angle = 90)

p3

What is the average daily activity pattern?

# Average number of steps for each 5-minute interval, computed over all
# non-missing observations of that interval.
inter <- aggregate(steps ~ interval, data = activity, FUN = mean)
head(inter)
##   interval     steps
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396
# Line plot of the per-interval averages.
plot(
  x = inter$interval,
  y = inter$steps,
  type = "l",
  col = "red",
  main = "Average daily activity pattern"
)

# 5-minute interval that contains the maximum number of steps.
# The row index is computed once and reused, so the lookup and the printed
# answer stay correct if the underlying data changes.
peak_row <- which.max(inter$steps)
peak_row
## [1] 104
inter[peak_row, ]
##     interval    steps
## 104      835 206.1698
# Answer, derived from the data rather than typed in by hand:
paste("The interval is", inter$interval[peak_row])
## [1] "The interval is 835"

Imputing missing values

# Total number of missing values in the data set:
sum(is.na(activity))
## [1] 2304
# Missing-data pattern, using the mice package
# (TRUE spelled out: `T` is an ordinary variable that can be reassigned).
md.pattern(activity, plot = TRUE)

##       date interval steps     
## 15264    1        1     1    0
## 2304     1        1     0    1
##          0        0  2304 2304
# Flag the rows whose step count is missing; the column is addressed by name
# so the code does not depend on column order.
missing <- is.na(activity$steps)
# Imputation value: the overall mean of the per-interval averages, i.e. one
# single number that is used for every missing observation.
m <- mean(inter$steps)
# Work on a copy so the raw data stays untouched.
activityfull <- activity
activityfull$steps[missing] <- m
head(activityfull)
##     steps       date interval
## 1 37.3826 2012-10-01        0
## 2 37.3826 2012-10-01        5
## 3 37.3826 2012-10-01       10
## 4 37.3826 2012-10-01       15
## 5 37.3826 2012-10-01       20
## 6 37.3826 2012-10-01       25
# Total steps per day after imputation:
dayfull <- aggregate(steps ~ date, data = activityfull, FUN = sum)
head(dayfull)
##         date    steps
## 1 2012-10-01 10766.19
## 2 2012-10-02   126.00
## 3 2012-10-03 11352.00
## 4 2012-10-04 12116.00
## 5 2012-10-05 13294.00
## 6 2012-10-06 15420.00
# Mean and median of the daily totals after imputation.
mea_full <- mean(dayfull$steps)
med_full <- median(dayfull$steps)

rbind(mea_full, med_full)
##              [,1]
## mea_full 10766.19
## med_full 10766.19
# Histogram of the daily totals after imputation.
g1 <- ggplot(data = dayfull, aes(x = steps)) +
  geom_histogram(bins = 30, fill = "blue") +
  ggtitle("Steps taken per day")

g1

# Side-by-side comparison of the statistics before and after imputation.
rbind(mea, mea_full, med, med_full)
##              [,1]
## mea      10766.19
## mea_full 10766.19
## med      10765.00
## med_full 10766.19

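Because every missing value was replaced with the overall mean number of steps per 5-minute interval (about 37.38), a day whose 288 intervals are all missing, such as 2012-10-01, receives a total of 288 × 37.38 ≈ 10766.19, which is exactly the original daily mean. The mean is therefore unchanged by the imputation, which is not surprising. The median has changed slightly, from 10765 to 10766.19.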

Are there differences in activity patterns between weekdays and weekends?

# Convert the date strings to Date objects and record the weekday name.
# Note that weekdays() returns names in the current locale (French in the
# output shown below), so `days` is kept for display only.
activityfull$date <- as.Date(activityfull$date)

activityfull$days <- weekdays(activityfull$date)

str(activityfull)
## 'data.frame':    17568 obs. of  4 variables:
##  $ steps   : num  37.4 37.4 37.4 37.4 37.4 ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
##  $ days    : chr  "lundi" "lundi" "lundi" "lundi" ...
# Classify each row as weekend or weekday from the numeric day of the week
# (POSIXlt `wday`: 0 = Sunday, 6 = Saturday). Unlike comparing against
# translated day names, this gives the same result in every locale.
activityfull <- activityfull %>%
  mutate(
    daytype = ifelse(
      as.POSIXlt(date)$wday %in% c(0, 6),
      "weekend",
      "weekday"
    )
  )

activityfull$daytype <- as.factor(activityfull$daytype)
str(activityfull)
## 'data.frame':    17568 obs. of  5 variables:
##  $ steps   : num  37.4 37.4 37.4 37.4 37.4 ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
##  $ days    : chr  "lundi" "lundi" "lundi" "lundi" ...
##  $ daytype : Factor w/ 2 levels "weekday","weekend": 1 1 1 1 1 1 1 1 1 1 ...
# Average number of steps per interval, computed separately for weekdays and
# weekends.
agg <- aggregate(steps ~ daytype + interval, data = activityfull, FUN = mean)

# One panel per day type, stacked vertically. The title says "Average" to
# match the aggregation (FUN = mean) and the y-axis label.
xyplot(
  steps ~ interval | daytype,
  data = agg,
  type = "l",
  main = "Average Number of Steps within Intervals by dayType",
  xlab = "Daily Intervals",
  ylab = "Average Number of Steps",
  layout = c(1, 2)
)