#Loading Libraries
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(lattice)
# Read the raw activity measurements from "activity.csv" in the working
# directory, then show the structure of the resulting data frame.
activity <- read.csv(file = "activity.csv", header = TRUE)
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
What is the mean total number of steps taken per day? 1. Calculate the total number of steps taken per day.
# Daily totals: sum the step counts within each date. The formula interface
# of aggregate() omits rows with a missing `steps` value by default.
Steps_total_per_day <- aggregate(steps ~ date, data = activity, FUN = sum)
head(Steps_total_per_day, n = 6L)
## date steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
# Per-date average of the recorded 5-minute step counts; show the first ten.
mean_per_day <- aggregate(steps ~ date, data = activity, FUN = mean)
head(mean_per_day, n = 10L)
## date steps
## 1 2012-10-02 0.43750
## 2 2012-10-03 39.41667
## 3 2012-10-04 42.06944
## 4 2012-10-05 46.15972
## 5 2012-10-06 53.54167
## 6 2012-10-07 38.24653
## 7 2012-10-09 44.48264
## 8 2012-10-10 34.37500
## 9 2012-10-11 35.77778
## 10 2012-10-12 60.35417
# Histogram of the daily step totals.
hist(
  x = Steps_total_per_day[["steps"]],
  main = "Total Number of Steps Taken Each Day",
  xlab = "Steps",
  ylab = "Number of Days",
  col = "orange"
)
3a. mean calculation
# Mean of the daily step totals.
mean_of_steps <- mean(Steps_total_per_day[["steps"]])
print(mean_of_steps)
## [1] 10766.19
3b. median calculation
# Median of the daily step totals.
median_steps <- median(Steps_total_per_day[["steps"]])
print(median_steps)
## [1] 10765
What is the average daily activity pattern?
1. Make a time series plot (type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis). 1.1 The average number of steps taken and the time series plot:
# Average number of steps for each 5-minute interval, taken across all dates.
average_steps <- aggregate(steps ~ interval, data = activity, FUN = mean)

# Line plot of the per-interval averages. The axis labels are spelled out so
# they read "interval" and "steps", matching the column names.
plot(
  x = average_steps[["interval"]],
  y = average_steps[["steps"]],
  type = "l",
  col = "red",
  main = "Time Series Plot",
  xlab = "interval",
  ylab = "steps"
)

# Interval identifier of the row with the highest average step count.
interval_max <- average_steps$interval[which.max(average_steps$steps)]
print(interval_max)
## [1] 835
Imputing missing values. 1. Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs).
# Number of rows that contain at least one NA in any column.
# complete.cases() flags rows with no missing values, so its negation counts
# incomplete rows; the previous version only looked at the `steps` column.
missing_values <- sum(!complete.cases(activity))
print(missing_values)
## [1] 2304
Answer 1: The total number of rows with NAs is 2304.
2. Replacing the missing values with a single fill value: the mean number of steps per 5-minute interval, taken over all intervals. 2.1 Averaging: the overall mean of the per-interval average number of steps
# Single fill value: the mean of the per-interval average step counts.
mean_values <- mean(average_steps[["steps"]])
print(mean_values)
## [1] 37.3826
Create a new dataset that is equal to the original dataset but with the missing data filled in. 3.1 Create a new dataset
# Start the imputed dataset as a copy of the raw data so that later changes to
# `new_df` do not alter `activity`.
new_df <- activity
3.2 Fill the missing step counts in new_df with the overall mean value (37.3826)
# Replace missing step counts with the fill value. Only the `steps` column is
# touched: a blanket `new_df[is.na(new_df)] <- ...` would also write the step
# mean into any NA that happened to occur in `date` or `interval`.
new_df$steps[is.na(new_df$steps)] <- mean_values
head(new_df)
## steps date interval
## 1 37.3826 2012-10-01 0
## 2 37.3826 2012-10-01 5
## 3 37.3826 2012-10-01 10
## 4 37.3826 2012-10-01 15
## 5 37.3826 2012-10-01 20
## 6 37.3826 2012-10-01 25
4. Make a histogram of the total number of steps taken each day, and calculate and report the mean and median total number of steps taken per day.
# Daily step totals recomputed from the dataset with missing values filled in.
totalstepsnew_df <- aggregate(steps ~ date, data = new_df, FUN = sum)

# Histogram of the daily totals after imputation.
hist(
  x = totalstepsnew_df[["steps"]],
  col = "gray",
  xlab = "Steps",
  ylab = "Number of Days",
  main = "Total Steps Taken Each day - Replacing NA"
)
4.1 Calculate mean
# Mean of the daily step totals after imputation.
mean_steps_new_df <- mean(totalstepsnew_df$steps)
# The value is a single number, so print() it like the other summary
# statistics in this report instead of taking head() of a scalar.
print(mean_steps_new_df)
## [1] 10766.19
4.2 Calculate median
# Median of the daily step totals after imputation.
median_steps_new_df <- median(totalstepsnew_df[["steps"]])
print(median_steps_new_df)
## [1] 10766.19
Do these values differ from the estimates from the first part of the assignment? Only insignificantly. The mean is unchanged (10766.19) and the median moves from 10765 to 10766.19. Replacing the NA values increased the number of days that have a total, but made an insignificant difference to the previous values of the mean and median.
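What is the impact of imputing missing data on the estimates of the total daily number of steps? The number of days with a total increased: observations with missing step counts were previously dropped from the daily totals and are now included with the imputed value.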
Are there differences in activity patterns between weekdays and weekends? The weekdays() function may be of some help for this part. Use the dataset with the filled-in missing values.
Create a new factor variable in the dataset with two levels “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Convert the "YYYY-MM-DD" date strings into Date objects.
new_df$date <- as.Date(new_df$date)

# Label every observation as "weekday" or "weekend".
# wday(week_start = 1) numbers Monday as 1 through Sunday as 7, so 6 and 7 are
# Saturday and Sunday. Comparing weekdays() against English day names depends
# on the session locale and would label every row "weekday" in a non-English
# locale. The pipeline's own `date` column is used instead of reaching back
# into `new_df`.
# NOTE(review): the task text asks for a factor; the column is kept as
# character so the summarised output keeps its current type. Wrap the
# ifelse() in factor() if a true factor is required.
df <- new_df %>%
  mutate(
    dayofweek = ifelse(wday(date, week_start = 1) >= 6, "weekend", "weekday")
  )

# Total number of steps per 5-minute interval within each day type.
df2 <- df %>%
  group_by(dayofweek, interval) %>%
  summarize(sumsteps = sum(steps))
## `summarise()` has grouped output by 'dayofweek'. You can override using the
## `.groups` argument.
Display of the data table after summarization:
# Show the first six rows of the per-interval summary.
head(df2, n = 6L)
## # A tibble: 6 × 3
## # Groups: dayofweek [1]
## dayofweek interval sumsteps
## <chr> <int> <dbl>
## 1 weekday 0 315.
## 2 weekday 5 242.
## 3 weekday 10 231.
## 4 weekday 15 232.
## 5 weekday 20 228.
## 6 weekday 25 283.
# Compare the two day types using the average number of steps per interval.
# A week has five weekdays but only two weekend days, so the per-interval sums
# in `df2` are not comparable between panels; the previous plot also titled
# the data "Total" while labelling the y-axis "Average".
avg_steps_by_daytype <- aggregate(
  steps ~ interval + dayofweek,
  data = df,
  FUN = mean
)

# One panel per day type, stacked vertically (1 column, 2 rows) so the two
# profiles share the same x-axis and can be compared directly.
xyplot(
  steps ~ interval | dayofweek,
  data = avg_steps_by_daytype,
  type = "l",
  layout = c(1, 2),
  main = "Average Number of Steps within Intervals by dayofweek",
  xlab = "Daily Intervals",
  ylab = "Average Number of Steps"
)