#Course Project 1 for Reproducible Research
#Loading Data
raw_data <- read.csv("activity.csv",header = TRUE)
#Converts all the character values from the date column to a date format
raw_data$date <- as.Date(raw_data$date, "%Y-%m-%d")
#removes all NA values from the dataset
processed_data <- na.omit(raw_data)
# This Code calculates the total number of steps per day
Total_Steps_by_Date <- aggregate(steps ~ date, data = processed_data, FUN = sum)
# This Code shows a histogram of the steps, it is worth noticing that the size is set to 500 in order to show a condensed readable data
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
Histogram_Steps_day <- ggplot(Total_Steps_by_Date,aes(steps)) + geom_histogram( col = "white", fill = "blue", binwidth = 2500) + labs(title = "Number of Steps", x = "steps", y="Count")
print(Histogram_Steps_day)
# This Code calculates the average number of steps per day
Mean_Steps_by_Date <- mean(Total_Steps_by_Date$steps)
print(Mean_Steps_by_Date)
## [1] 10766.19
# This Code Calculates the median number of steps per day
Median_Steps_by_Date <- median(Total_Steps_by_Date$steps)
print(Median_Steps_by_Date)
## [1] 10765
# The first step is to simply create a time series (using the plot base system), of each intervale (i.e 0,5,10 up to 2355) and the average steps for each interval for each date
Average_steps_interval <- aggregate(steps ~ interval, data = processed_data, FUN = mean)
# 1.- Make a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)
plot(Average_steps_interval$interval,Average_steps_interval$steps, xlab = "5 Minute Interval", ylab = "Average Number of Steps", type ="l" , main = "Time Series - Average Steps per 5 Minutes Interval")
# 2.- Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
i <- which.max(Average_steps_interval$steps)
Maximum_5minute_interval<- Average_steps_interval[i,1]
print(Maximum_5minute_interval)
## [1] 835
Number_of_NA <- sapply(raw_data, function(x) sum(is.na(x)))
NA_in_Steps <- Number_of_NA[1]
# The number of NA's in the dataset is equal to:
print(NA_in_Steps)
## steps
## 2304
#copying the raw_data to a new variable just in case
Missing_Values_data <- raw_data
vector <- which(is.na(Missing_Values_data$steps))
# Firstly we identify the NA values and we match this missing values
# to the average value per interval (5 minutes)
for(i in vector){
m <- Missing_Values_data[i,3]
t <- filter(Average_steps_interval, Average_steps_interval$interval == m)
v <- t[1,2]
Missing_Values_data$steps[i] = v
}
#Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day.
Total_Steps_MV <- aggregate(steps ~ interval, data = Missing_Values_data, FUN = sum)
#Histogram
library(ggplot2)
Histogram_Steps <- ggplot(Total_Steps_MV,aes(steps)) + geom_histogram( col = "white", fill = "blue", binwidth = 1500) + labs(title = "Number of Steps per day", x = "steps", y="Count")
print(Histogram_Steps)
#Do these values differ from the estimates from the first part of the assignment?
# The values differ significantly, as now we have more data so the mean has been redcued
# Mean Value
mean(Total_Steps_MV$steps)
## [1] 1981.278
#Median
median(Total_Steps_MV$steps)
## [1] 1808
#What is the impact of imputing missing data on the estimates of the total daily number of steps?
#Additional data increases the frequency(count) per interval
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Data_by_Weekday <- Missing_Values_data
Data_by_Weekday <- mutate(Data_by_Weekday, weekday = weekdays(date))
#Chaging the values from weekday to weekend
Data_by_Weekday$weekday <- ifelse(Data_by_Weekday$weekday %in% c("Sunday","Saturday"),"weekend","weekday")
# Average number of steps per interval (every 5 minutes), per weekend
Weekend_data <- aggregate(steps ~ interval,data = filter(Data_by_Weekday, weekday == "weekend"), FUN = mean)
Weekend_data$weektype <- "weekend"
# Average number of steps per interval (every 5 minutes), per weekday
Weekday_data <- aggregate(steps ~ interval,data = filter(Data_by_Weekday, weekday == "weekday"), FUN = mean)
Weekday_data$weektype <- "weekday"
Act_pattern_Plot <- rbind(Weekend_data,Weekday_data)
#Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day.
#Do these values differ from the estimates from the first part of the assignment?
#What is the impact of imputing missing data on the estimates of the total daily number of steps?
qplot(interval,steps,data = Act_pattern_Plot, facets = weektype~.,geom = "line", xlab = "Number of Steps", ylab ="Interval")
# There is clearly a difference between Weekends and Weekdays, there is more activity during the week