# Load the raw activity data and parse its date column.
# The project directory is specific to the original author's machine, so only
# switch to it when it exists; otherwise "activity.csv" is resolved against
# the current working directory (e.g. the directory the report is knitted in).
project_dir <- "F:\\Coursera\\Course 5 Reproducible Research\\Project 1\\uploadproj1\\RepData_PeerAssessment1"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
step_data <- read.csv("activity.csv", stringsAsFactors = FALSE)
# Dates are stored as text in year-month-day form; convert them to Date
step_data$date <- as.Date(step_data$date, format = "%Y-%m-%d")
# Keep only the rows in which a step count was actually recorded
clean_step_data <- step_data[!is.na(step_data$steps), ]
# Total number of steps taken on each day
sum_steps <- aggregate(steps ~ date, data = clean_step_data, FUN = sum)
library(ggplot2)
# Histogram of the daily step totals.
# The column is mapped by name inside aes() so that ggplot2 looks it up in the
# `data` argument instead of capturing a free-standing vector.
g <- ggplot(data = sum_steps, aes(x = steps))
g + geom_histogram(binwidth = 500) +
  labs(title = "Frequency of Step Totals", x = "Total Steps", y = "Frequency") +
  ylim(0, 10.5)
# Mean and median of the daily step totals held in sum_steps
overall_mean <- with(sum_steps, mean(steps))
overall_median <- with(sum_steps, median(steps))
overall_mean
## [1] 10766.19
overall_median
## [1] 10765
The mean is $1.0766189 \times 10^{4}$ (about 10766.19). The median is 10765.
# Average the step counts within each interval over the rows without NAs
mean_steps_int <- aggregate(steps ~ interval, data = clean_step_data, FUN = mean)
# Line plot of the per-interval averages
# NOTE(review): the x axis is labelled "minutes" but plots the raw `interval`
# identifier - confirm that the identifier really is a minute count.
plot(
  x = mean_steps_int$interval,
  y = mean_steps_int$steps,
  type = "l",
  main = "Average Steps Taken Per Interval",
  xlab = "minutes",
  ylab = "number of steps"
)
# Per-interval mean computed on the full data set: na.pass keeps the NA rows
# in the model frame and na.rm = TRUE is forwarded to mean() to skip them.
interval_steps <- aggregate(
  steps ~ interval,
  data = step_data,
  FUN = mean,
  na.action = na.pass,
  na.rm = TRUE
)
# Largest per-interval average and the interval(s) at which it occurs
max_steps <- max(interval_steps$steps)
maxint <- interval_steps$interval[which(interval_steps$steps == max_steps)]
Interval 835 has the maximum number of steps on average.
# Count the rows whose step value is NA
missingsteps <- sum(is.na(step_data[["steps"]]))
There are 2304 missing step values (NAs) in the dataset.
# Look up the mean number of steps for one or more intervals.
#
# interval: vector of interval identifiers to look up.
# means:    data frame with columns `interval` and `steps`; defaults to the
#           `interval_steps` table built earlier in the script.
# Returns a numeric vector the same length as `interval`, holding the mean
# steps for each requested interval, or NA where the interval is not present
# in `means`.
getMeanSteps <- function(interval, means = interval_steps) {
  # match() pairs every requested interval with its row in the table, so
  # vector input is handled element by element instead of being recycled.
  means$steps[match(interval, means$interval)]
}
# Replace every NA step count with the mean for the corresponding interval.
# Only the rows that are actually missing are visited, and nothing is touched
# when there are none, so an empty or already complete data set is left as is.
missing_rows <- which(is.na(step_data$steps))
if (length(missing_rows) > 0) {
  # vapply() calls the lookup once per missing row and insists on exactly one
  # numeric value each time, so an unknown interval fails loudly.
  step_data$steps[missing_rows] <- vapply(
    step_data$interval[missing_rows],
    getMeanSteps,
    numeric(1)
  )
}
library(ggplot2)
# Total number of steps taken each day, now that the NAs have been filled in
sum_steps_nomiss <- aggregate(steps ~ date, data = step_data, FUN = sum)
# Histogram of the daily totals.
# The column is mapped by name inside aes() so that ggplot2 looks it up in the
# `data` argument instead of capturing a free-standing vector.
g <- ggplot(data = sum_steps_nomiss, aes(x = steps))
g + geom_histogram(binwidth = 500) + ylim(0, 12) +
  labs(title = "Frequency of Step Totals", x = "Total Steps", y = "Frequency")
# Mean and median of the daily step totals held in sum_steps_nomiss
new_mean <- with(sum_steps_nomiss, mean(steps))
new_median <- with(sum_steps_nomiss, median(steps))
new_mean
## [1] 10766.19
new_median
## [1] 10766.19
The original mean is $1.0766189 \times 10^{4}$ (about 10766.19) and the original median is 10765. We then replaced each NA with the mean for its interval. The new mean is $1.0766189 \times 10^{4}$ and the new median is also $1.0766189 \times 10^{4}$ (about 10766.19).
After replacing the missing values with the average for their corresponding interval, the mean stayed the same and the median increased by about 1.19 (from 10765 to 10766.19), so it now equals the mean.
In the histograms the data is less skewed and more normally distributed around the mean after the missing values are replaced.
# Create a factor variable showing whether each day is a weekday or a weekend.
# as.POSIXlt()$wday numbers the days 0-6 starting on Sunday, so the check does
# not depend on the locale-specific day names returned by weekdays().
is_weekend <- as.POSIXlt(step_data$date)$wday %in% c(0L, 6L)
# Label all rows in one vectorised step and store the result as a factor with
# a fixed level order ("Weekday" first, then "Weekend").
step_data$weekend <- factor(
  ifelse(is_weekend, "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
# Mean steps for every combination of interval and weekday/weekend label
dow_means <- aggregate(steps ~ interval + weekend, data = step_data, FUN = mean)
library(lattice)
# One line panel per label, stacked in a single column of two rows
xyplot(
  steps ~ interval | weekend,
  data = dow_means,
  type = "l",
  layout = c(1, 2),
  main = "Average Steps Per Day by Interval"
)