Reproducible Research: Peer Assessment 1

Loading and preprocessing the data

# Read the raw activity log; the literal string "NA" marks a missing value.
data <- read.csv("activity.csv", na.strings = "NA")
# Parse the ISO-formatted (year-month-day) date column into Date objects.
data$date <- as.Date(data$date, format = "%Y-%m-%d")

What is mean total number of steps taken per day?

# Keep only fully observed rows, then group them by calendar day.
dataAc <- data[complete.cases(data), ]
# Split by the filtered frame's own date column so the grouping vector has
# exactly one entry per row of dataAc. Splitting by data$date (the unfiltered
# column) raised "data length is not a multiple of split variable": rows were
# paired positionally with dates from the unfiltered frame, and the trailing
# dates received empty groups whose step total is 0.
s <- split(dataAc, dataAc$date)

# Total number of steps per day. vapply() guarantees a numeric vector (named
# by date) regardless of the input.
total_steps <- vapply(s, function(day) sum(day[["steps"]]), numeric(1))
hist(total_steps)

plot of chunk histogram

## The na.omit() call below is left disabled: total_steps is computed from
## dataAc, which contains only complete cases.
## total_steps<-na.omit(total_steps)
## Mean and median of the per-day step totals.
m <- mean(total_steps)
md <- median(total_steps)

Mean of total number of steps taken per day = 9354.2295

Median of total number of steps taken per day = 10395

What is the average daily activity pattern?

# Average number of steps in each interval across all days. Missing step
# counts are ignored via na.rm = TRUE, so no complete-case filtering is needed.
da <- split(data, data$interval)
daily_activity <- vapply(
  da,
  function(slot) mean(slot[["steps"]], na.rm = TRUE),
  numeric(1)
)
# Build the summary frame directly instead of going through cbind(): the
# matrix round-trip coerced the interval identifiers to double.
data_dailyActivity <- data.frame(
  interval = as.integer(names(daily_activity)),
  steps = as.numeric(daily_activity)
)
# Time-series view of the average daily pattern.
with(data_dailyActivity, plot(interval, steps, type = "l"))

plot of chunk time series

# Identifier of the interval with the highest average step count (despite its
# name, max_steps holds the interval, not the step count). which.max() picks
# the first maximum without comparing floating-point values with `==`.
max_steps <- data_dailyActivity$interval[which.max(data_dailyActivity$steps)]

Interval 835, on average across all the days in the dataset, contains the maximum number of steps

Imputing missing values

# Number of rows that have at least one missing field.
missing_values <- sum(!complete.cases(data))

There are 2304 rows in the dataset that contain NA values. My strategy for filling in these missing values is to use the mean for that 5-minute interval.

## Add a column holding the average steps for each row's interval.
## The mean is looked up by interval value rather than relying on the 288
## interval means being recycled in row order, which is only correct when
## every day is complete and sorted.
data$averageStepsPerInterval <- data_dailyActivity$steps[
  match(data$interval, data_dailyActivity$interval)
]
## Replace missing step counts with that interval mean in one vectorised step.
## Columns are addressed by name; the original used positions 1 and 4.
## NOTE(review): assumes `steps` is the first column of the CSV -- confirm.
missing_steps <- is.na(data$steps)
data$steps[missing_steps] <- data$averageStepsPerInterval[missing_steps]
## Daily step totals after imputation, shown as a histogram; the mean and
## median are derived from these totals.
n <- split(data, data$date)
new_total_steps <- sapply(n, function(day) sum(day[["steps"]]))
hist(new_total_steps)

plot of chunk replacing missing values

# Mean and median of the daily totals after NA replacement; these overwrite
# the earlier m and md.
m <- mean(new_total_steps)
md <- median(new_total_steps)

Mean of total number of steps taken per day after NA replacement = 1.0766 × 10⁴

Median of total number of steps taken per day after NA replacement = 1.0766 × 10⁴

Median and mean values changed.

Are there differences in activity patterns between weekdays and weekends?

library(ggplot2)
# Label every observation as "weekday" or "weekend".
# The original loop compared the Date column itself with "Saturday"/"Sunday",
# which fails with "character string is not in a standard unambiguous format".
# POSIXlt's wday component (0 = Sunday ... 6 = Saturday) gives the day of the
# week without depending on locale-specific day names.
day_of_week <- as.POSIXlt(data$date)$wday
data$WeekdayorWeekend <- ifelse(
  day_of_week %in% c(0L, 6L),
  "weekend",
  "weekday"
)

# Average steps per interval, computed separately for weekdays and weekends.
# The original averaged by interval only and then cbind()-ed the full-length
# label column, which recycled the same averages into both panels and passed
# the numbers through a character matrix.
new_data_dailyActivity <- aggregate(
  steps ~ interval + WeekdayorWeekend,
  data = data,
  FUN = mean
)
# Keep the column names used by the plot: interval, steps, wd.
new_data_dailyActivity <- data.frame(
  interval = as.integer(new_data_dailyActivity$interval),
  steps = as.numeric(new_data_dailyActivity$steps),
  wd = new_data_dailyActivity$WeekdayorWeekend
)

# One panel per day type, each showing the average steps across intervals.
q <- ggplot(new_data_dailyActivity, aes(x = interval, y = steps))
q + facet_grid(. ~ wd) + geom_line()

plot of chunk activity pattern

## NOTE(review): library() is called without a package name, so this line
## attaches no package -- looks like a leftover; confirm the intended package.
library()
## NOTE(review): `movies` is not created anywhere in the visible source and is
## unrelated to the activity data analysed above -- presumably scratch code;
## verify before keeping.
qplot(votes, rating, data = movies)

plot of chunk activity pattern

## NOTE(review): `panel.loess` is not defined in the visible source; the
## recorded output below shows this call failing.
qplot(votes, rating, data = movies, panel = panel.loess)

## Error: object 'panel.loess' not found

## NOTE(review): second attempt at a loess smoother on the same `movies` data;
## whether qplot() honours a `smooth` argument is not shown here -- verify.
qplot(votes, rating, data = movies, smooth = "loess")

plot of chunk activity pattern