## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'ggthemes' was built under R version 4.0.2
First, set your working directory; next, use the read.csv() function to load the dataset into the environment.
After we have our dataset, we can make some transformations:
# Reformat the dates as dd-mm-yy strings so they are easier to read.
activity$date <- format(
  as.Date(activity$date, format = "%Y-%m-%d"),
  format = "%d-%m-%y"
)
# Proportion of step observations that are missing (NA).
mean(is.na(activity$steps))
## [1] 0.1311475
# Total steps per day. The formula method of aggregate() omits rows whose
# steps value is NA by default (na.action = na.omit).
sumTable <- aggregate(steps ~ date, data = activity, FUN = sum)
colnames(sumTable) <- c("Date", "Steps")
# Histogram of the total number of steps per day, drawn with ggplot2.
ggplot(data = sumTable, mapping = aes(x = Steps)) +
  geom_histogram(bins = 5, colour = "black", fill = "tomato2") +
  theme_economist() +
  labs(title = "Total Steps per day", x = "Steps", y = "Frequency")
Now we are going to calculate the mean and median of the total number of steps taken per day
## [1] 10766
## [1] 10765
The mean of the total steps per day is 10766, and the median is 10765.
First of all, we are going to remove the NA observations in the steps variable, and next we are going to plot the results:
## Warning: package 'plyr' was built under R version 4.0.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Keep only the observations that have a recorded step count.
clean <- activity[!is.na(activity$steps), ]
# Average number of steps for each 5-minute interval across all days.
intervalTable <- ddply(clean, .(interval), summarize, Avg = mean(steps))
# Same table under the name used by the plot below; reuse the result instead
# of recomputing it from scratch.
interval_Table <- intervalTable
# Line plot of the average steps per interval. Axis labels are set with
# xlab()/ylab(); ggplot() itself does not use extra xlab/ylab arguments.
ggplot(interval_Table, aes(x = interval, y = Avg)) +
  geom_line() +
  xlab("Interval") +
  ylab("Average Number of Steps") +
  ggtitle("Average Number of Steps per Interval") +
  theme_economist()
The 5-minute interval that, on average, contains the maximum number of steps:
## [1] 206
## [1] 835
The maximum average number of steps for a 5-minute interval is 206 steps.
The 5-minute interval that has the maximum average number of steps is interval 835.
## [1] 2304
## [1] 0.1311475
We have 2304 rows with NA observations (approx. 13%). The method that I’m going to follow is to substitute the missing steps with the average for the 5-minute interval based on the day of the week.
library(plyr)
library(dplyr)
# Observations that have a recorded step count.
clean <- activity[!is.na(activity$steps), ]
# Average steps per 5-minute interval.
intervalTable <- ddply(clean, .(interval), summarize, Avg = mean(steps))
# Average steps per 5-minute interval and value of the `day` column.
avgTable <- ddply(clean, .(interval, day), summarize, Avg = mean(steps))
# Rows whose step count is missing, each paired with the matching average.
nadata <- activity[is.na(activity$steps), ]
newdata <- merge(nadata, avgTable, by = c("interval", "day"))
Create a new dataset that is equal to the original dataset but with the missing data filled in.
# Pick columns by name instead of by position: Avg takes the place of the
# missing steps, and the result is renamed to the layout of `clean` so that
# rbind() can stack the two data frames.
newdata2 <- newdata[, c("Avg", "date", "interval", "day", "DateTime")]
colnames(newdata2) <- c("steps", "date", "interval", "day", "DateTime")
mergeData <- rbind(clean, newdata2)
Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?:
# Total steps per day after the missing values have been filled in.
sumTable2 <- aggregate(steps ~ date, data = mergeData, FUN = sum)
colnames(sumTable2) <- c("Date", "Steps")
# Mean of the daily totals, truncated to an integer.
as.integer(mean(sumTable2$Steps))
## [1] 10807
## [1] 11015
Histogram of the total number of steps taken each day after missing values are imputed, using ggplot2:
# Overlay the two daily-total histograms: the imputed data (black) is added
# first and the original data (tomato2) is added after it.
ggplot() +
  geom_histogram(
    data = sumTable2, aes(x = Steps),
    bins = 5, color = "black", fill = "black"
  ) +
  geom_histogram(
    data = sumTable, aes(x = Steps),
    bins = 5, color = "black", fill = "tomato2"
  ) +
  theme_economist()
The new mean of the imputed data is 10807 steps compared to the old mean of 10766 steps. The new median of the imputed data is 11015 steps compared to the old median of 10765 steps.
I’m going to create a new factor variable in the dataset with two levels - “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
library(lattice)
# Label each row as "Weekend" or "Weekday". The day names depend on the locale
# in which the `day` column was built, so accept the Spanish names (with or
# without the accent) and the English names, ignoring case.
mergeData$DayCategory <- ifelse(
  tolower(mergeData$day) %in%
    c("sabado", "s\u00e1bado", "domingo", "saturday", "sunday"),
  "Weekend",
  "Weekday"
)
It’s important to check the language of the day names in your dataset: mine were in Spanish, so the mergeData$day column contains “sabado” (or “sábado”) and “domingo”; if yours are in English, it contains “Saturday” and “Sunday”. The comparison above accepts both.
# Mean steps for every 5-minute interval within each day category.
intervalTable2 <- ddply(
  mergeData,
  .(interval, DayCategory),
  summarize,
  Avg = mean(steps)
)
# One line panel per day category, laid out as 1 column by 2 rows.
xyplot(
  Avg ~ interval | DayCategory,
  data = intervalTable2,
  type = "l",
  layout = c(1, 2),
  main = "Average Steps per Interval Based on Type of Day",
  ylab = "Average Number of Steps",
  xlab = "Interval"
)
Yes, the step activity trends are different based on whether the day occurs on a weekend or not.
Francisco Javier Carela Ferrer