Synopsis

This document analyzes activity data from a wearable device. First, it examines the mean total number of steps taken per day. Second, it examines the average daily activity pattern. Lastly, it examines the difference in walking patterns between weekdays and weekends.

Open file

## Open the file
# Directory that holds activity.csv. The path is machine-specific.
dir <- "D:\\Data specialist\\Reproducible Research\\Data"
# Read through an explicit path rather than setwd(), so the session's working
# directory is left untouched.
Data1 <- read.csv(file.path(dir, "activity.csv"))

What is mean total number of steps taken per day?

The salmon line indicates the median, and the red line indicates the mean.

# Total steps per day. With na.rm = TRUE, a day whose readings are all NA
# sums to 0 rather than NA.
Sum1 <- ddply(Data1, c("date"), summarise, Sum = sum(steps, na.rm = TRUE))
CalDataM <- mean(Sum1$Sum)
CalDataMd <- median(Sum1$Sum)

# Histogram of the daily totals. The bin count is set on geom_histogram()
# itself: adding a separate stat_bin() layer would draw a second histogram on
# top of a default 30-bin one.
g1 <- ggplot(data = Sum1, aes(x = Sum))
g2 <- g1 +
  geom_histogram(bins = 55) +
  ggtitle("Total number of steps taken each day") +
  # Red marks the mean, salmon marks the median.
  geom_vline(xintercept = CalDataM, colour = "red", lwd = 1.5) +
  geom_vline(xintercept = CalDataMd, colour = "salmon", lwd = 1.5)
g2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Table for Calculated Mean and Median of total number of steps per day

# One-column summary table: the mean and median of the daily step totals.
we <- data.frame(
  Value = c(CalDataM, CalDataMd),
  row.names = c("Mean", "Median")
)
kable(we)
Value
Mean 9354.23
Median 10395.00

What is the average daily activity pattern?

The red point marks the maximum value in the average daily activity pattern.

# Average number of steps for each interval, ignoring missing readings.
MeanData <- ddply(Data1, c("interval"), summarise,
                  Mean = mean(steps, na.rm = TRUE))

# Row(s) holding the highest interval average. They are passed to geom_point()
# as that layer's own data, so the marker is drawn once per peak row instead of
# being recycled across every row of MeanData, and tied maxima still plot.
peak_rows <- MeanData[which(MeanData$Mean == max(MeanData$Mean, na.rm = TRUE)), ]

g3 <- ggplot(MeanData, aes(x = interval, y = Mean))
g4 <- g3 +
  geom_line(na.rm = TRUE) +
  ggtitle("Average Step Taken for interval") +
  labs(x = "Interval", y = "Average Step") +
  geom_point(data = peak_rows, colour = "red")
g4

The interval with the maximum average number of steps.

# Print the row(s) of MeanData whose average equals the overall maximum.
MeanData[which(MeanData$Mean == max(MeanData$Mean)), ]
##     interval     Mean
## 104      835 206.1698

Imputing missing values

Table of the number of NA values in the data set

Missing values make up 13% of the observations, which is too large a share to ignore.

# Count the missing and the observed step readings, plus each count's share
# of all rows as a rounded percentage.
na_count <- sum(is.na(Data1$steps))
ok_count <- sum(!is.na(Data1$steps))
counts <- c(na_count, ok_count)
shares <- round(counts / (na_count + ok_count) * 100)

# Bind the two vectors column-wise, then label rows and columns for display.
LA <- data.frame(cbind(counts, shares))
dimnames(LA) <- list(c("NA", "Not NA"), c("Value", "Per cent"))
kable(LA)
Value Per cent
NA 2304 13
Not NA 15264 87

Fill NA with the Average per Interval

I filled each missing (NA) value with the average number of steps for its interval.

# Impute missing step counts: each NA is replaced by the mean number of steps
# for that row's interval, taken from MeanData.
D <- Data1
missing_rows <- is.na(D$steps)

# match() finds, row by row, the MeanData entry for the row's own interval, so
# every filled-in value is aligned with the row it is written to. Collecting
# the replacements grouped by interval and assigning them in file order would
# put the interval means on the wrong rows.
fill_idx <- match(D$interval[missing_rows], MeanData$interval)
D$steps[missing_rows] <- MeanData$Mean[fill_idx]

Check the difference between analysis with NA and without.

# Total steps per day, computed on the imputed data set D.
Sum2 <- ddply(D, c("date"), summarise,
              SUM = sum(steps))

CalData1M <- mean(Sum2$SUM)
CalData1Md <- median(Sum2$SUM)

# Histogram of the imputed daily totals. bins = 30 is the value ggplot2 would
# otherwise fall back to; stating it keeps the plot the same without the
# "Pick better value" message.
SGraph <- ggplot(Sum2, aes(x = SUM)) +
  geom_histogram(bins = 30) +
  # "\n" is the newline escape, which puts "without NA" on a second title line.
  ggtitle("Total number of steps taken each day \nwithout NA") +
  labs(x = "Walk", y = "Frequency") +
  # Red marks the mean, salmon marks the median.
  geom_vline(xintercept = CalData1M, colour = "red", lwd = 1.0) +
  geom_vline(xintercept = CalData1Md, colour = "salmon", lwd = 1.0)

SGraph
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Table showing the change in the mean and median.

# Compare the daily-total mean and median before ("NA") and after ("W/o NA")
# the missing values were filled in: one row per data set.
with_na <- c(CalDataM, CalDataMd)
imputed <- c(CalData1M, CalData1Md)

cd <- data.frame(rbind(with_na, imputed))
dimnames(cd) <- list(c("NA", "W/o NA"), c("Mean", "Median"))
kable(cd, digits = 2)
Mean Median
NA 9354.23 10395
W/o NA 10766.19 11015

The difference between weekday and weekend

One thing to note is that weekday activity is high from interval 600 to 1000, especially in the 800 to 1000 range. On weekends, activity in the 600 to 1000 range is relatively low, but activity from 1000 to 2000 is higher than on weekdays.

# Label every observation as falling on a weekday or a weekend.
Wd <- D
Wd$date <- as.Date(D$date, "%Y-%m-%d")

# as.POSIXlt()$wday numbers the days 0-6 starting on Sunday, so the test does
# not depend on the session locale. Day names returned by weekdays() are
# translated, so comparing them with "Saturday"/"Sunday" would label every day
# "Weekday" in a non-English locale.
day_number <- as.POSIXlt(Wd$date)$wday
Wd$DayWeek <- factor(
  ifelse(day_number %in% c(0, 6), "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)

# Mean steps per interval, computed separately for weekdays and weekends.
CalData3 <- ddply(Wd, c("interval", "DayWeek"), summarise,
                  Mean = mean(steps, na.rm = TRUE))

# One panel per day type, stacked vertically.
DwGraph <- ggplot(CalData3, aes(x = interval, y = Mean)) +
  geom_line(stat = "identity") +
  facet_grid(DayWeek ~ .)
DwGraph