This document analyzes activity data from a wearable device. First, it analyzes the mean total number of steps taken per day. Second, it analyzes the average daily activity pattern. Lastly, it analyzes the difference in walking patterns between weekdays and weekends.
## Open the file
# Load the raw activity data into Data1.
# The file is addressed through its full path instead of calling setwd(), so
# reading the data does not change the working directory as a side effect.
# Later code uses the columns `steps`, `date` and `interval` of this table.
dir <- "D:\\Data specialist\\Reproducible Research\\Data"
Data1 <- read.csv(file.path(dir, "activity.csv"))
The salmon line indicates the median; the red line indicates the mean.
# Total number of steps per day. Because of na.rm = TRUE, a day whose readings
# are all NA contributes a total of 0 rather than NA.
Sum1 <- ddply(Data1, c("date"), summarise, Sum = sum(steps, na.rm = TRUE))

# Mean and median of the daily totals; also reused by the later tables.
CalDataM <- mean(Sum1$Sum)
CalDataMd <- median(Sum1$Sum)

# Histogram of the daily totals with the mean (red) and median (salmon) marked.
# The bin count is passed to geom_histogram() directly: adding a separate
# stat_bin() layer would draw a second histogram on top of a default 30-bin one.
g1 <- ggplot(data = Sum1, aes(x = Sum))
g2 <- g1 +
  geom_histogram(bins = 55) +
  ggtitle("Total number of steps taken each day") +
  geom_vline(xintercept = CalDataM, colour = "red", lwd = 1.5) +
  geom_vline(xintercept = CalDataMd, colour = "salmon", lwd = 1.5)
g2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# One-column summary table: the mean and median of the daily step totals,
# labelled through the row names.
we <- data.frame(
  Value = c(CalDataM, CalDataMd),
  row.names = c("Mean", "Median")
)
kable(we)
| | Value |
|---|---|
| Mean | 9354.23 |
| Median | 10395.00 |
The red point marks the maximum value in the daily activity pattern.
# Average number of steps for each interval across all days (NA readings are
# dropped from the average).
MeanData <- ddply(Data1, c("interval"), summarise,
                  Mean = mean(steps, na.rm = TRUE))

# Row of MeanData holding the highest average. which.max() locates it once,
# without comparing floating point values with `==`.
PeakData <- MeanData[which.max(MeanData$Mean), ]

# Line plot of the daily pattern. The peak is supplied as the layer's own data,
# so exactly one red point is drawn; it inherits the x/y mapping from ggplot().
g3 <- ggplot(MeanData, aes(x = interval, y = Mean))
g4 <- g3 +
  geom_line(na.rm = TRUE) +
  ggtitle("Average Step Taken for interval") +
  labs(x = "Interval", y = "Average Step") +
  geom_point(data = PeakData, colour = "red")
g4

# Print the interval with the maximum average number of steps.
PeakData
## interval Mean
## 104 835 206.1698
13% of the step values are missing, which is too large a share to ignore.
# Count the missing (A) and recorded (B) step values, and express each as a
# rounded percentage of all rows (C and D).
A <- sum(is.na(Data1$steps))
B <- sum(!is.na(Data1$steps))
C <- round(A / (A + B) * 100)
D <- round(B / (A + B) * 100)

# Single-column matrices holding the counts and the percentages.
LA1 <- matrix(c(A, B), ncol = 1)
LA2 <- matrix(c(C, D), ncol = 1)

# Two-row table: one row for NA values, one for non-NA values.
LA <- data.frame(cbind(LA1, LA2))
colnames(LA) <- c("Value", "Per cent")
rownames(LA) <- c("NA", "Not NA")
kable(LA)
| | Value | Per cent |
|---|---|---|
| NA | 2304 | 13 |
| Not NA | 15264 | 87 |
I filled each missing (NA) value with the average number of steps for that interval.
# Build D, a copy of Data1 in which every missing `steps` value is replaced by
# the average number of steps of the same interval (taken from MeanData).
D <- Data1
MissingSteps <- is.na(D$steps)

# match() looks up, for each missing row, the MeanData row with the same
# interval, so every replacement lands on the row it belongs to. Collecting
# the replacements interval by interval and assigning them back in row order
# would pair the values with the wrong rows.
D$steps[MissingSteps] <- MeanData$Mean[
  match(D$interval[MissingSteps], MeanData$interval)
]
# Total number of steps per day, computed from the imputed data set D.
Sum2 <- ddply(D, c("date"), summarise,
              SUM = sum(steps))

# Mean and median of the imputed daily totals; reused by the comparison table.
CalData1M <- mean(Sum2$SUM)
CalData1Md <- median(Sum2$SUM)

# Histogram of the imputed daily totals with the mean (red) and median (salmon)
# marked. bins = 30 is the value ggplot2 falls back to when none is given;
# stating it keeps the same plot without the "Pick better value" message.
# "\n" breaks the title onto a second line.
SGraph <- ggplot(Sum2, aes(x = SUM)) +
  geom_histogram(bins = 30) +
  ggtitle("Total number of steps taken each day\nwithout NA") +
  labs(x = "Walk", y = "Frequency") +
  geom_vline(xintercept = CalData1M, colour = "red", lwd = 1.0) +
  geom_vline(xintercept = CalData1Md, colour = "salmon", lwd = 1.0)
SGraph
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Comparison table of the mean and median daily totals: row "NA" holds the
# values computed from the raw data, row "W/o NA" those from the imputed data.
# The table is built directly so that no variable named `c` masks base::c().
cd <- data.frame(
  Mean = c(CalDataM, CalData1M),
  Median = c(CalDataMd, CalData1Md),
  row.names = c("NA", "W/o NA")
)
kable(cd, digits = 2)
| | Mean | Median |
|---|---|---|
| NA | 9354.23 | 10395 |
| W/o NA | 10766.19 | 11015 |
One thing to note is the high activity between intervals 600 and 1000 on weekdays, especially in the 800 to 1000 range. On weekends, although activity in the 600 to 1000 range is relatively low, activity between 1000 and 2000 is higher than on weekdays.
# Label every row of the imputed data as "Weekday" or "Weekend" and compare the
# average number of steps per interval between the two groups.
Wd <- D
Wd$date <- as.Date(D$date, "%Y-%m-%d")

# POSIXlt's `wday` is the day of the week as a number (0 = Sunday,
# 6 = Saturday). Unlike the day names returned by weekdays(), it does not
# depend on the session locale.
DayNumber <- as.POSIXlt(Wd$date)$wday
Wd$DayWeek <- factor(
  ifelse(DayNumber == 0 | DayNumber == 6, "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)

# Average steps for each interval within each day type.
CalData3 <- ddply(Wd, c("interval", "DayWeek"), summarise,
                  Mean = mean(steps, na.rm = TRUE))

# One line panel per day type, stacked vertically.
DwGraph <- ggplot(CalData3, aes(x = interval, y = Mean)) +
  geom_line() +
  facet_grid(DayWeek ~ .)
DwGraph