# Download (once) and load the activity monitoring dataset into `activitydata`.
#
# All paths are relative to the current working directory, so the script no
# longer depends on a user-specific folder and does not change the working
# directory as a side effect.
data_dir <- "./data"
zip_path <- file.path(data_dir, "Dataset.zip")
csv_path <- file.path(data_dir, "activity.csv")
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"

if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

# Only hit the network / unpack when the CSV is not already available.
if (!file.exists(csv_path)) {
  if (!file.exists(zip_path)) {
    # mode = "wb" keeps the binary zip archive intact on Windows.
    download.file(fileUrl, destfile = zip_path, mode = "wb")
  }
  unzip(zipfile = zip_path, exdir = data_dir)
}

# Read the CSV from the same directory the archive was extracted into.
activitydata <- read.csv(csv_path)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Preview the first rows of the raw activity data.
head(activitydata)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Report the number of rows and columns.
dim(activitydata)
## [1] 17568 3
# Compact overview of every column: its type and first few values.
glimpse(activitydata)
## Observations: 17,568
## Variables: 3
## $ steps <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ date <fctr> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 10...
# Per-column summary statistics; for steps this includes the count of NA values.
summary(activitydata)
## steps date interval
## Min. : 0.00 2012-10-01: 288 Min. : 0.0
## 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
## Median : 0.00 2012-10-03: 288 Median :1177.5
## Mean : 37.38 2012-10-04: 288 Mean :1177.5
## 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
## Max. :806.00 2012-10-06: 288 Max. :2355.0
## NA's :2304 (Other) :15840
# Store the date column as class Date instead of the type read from the CSV.
activitydata[["date"]] <- as.Date(activitydata[["date"]])
Mean total number of steps taken per day
1. Calculate the total number of steps taken per day:
# Total steps per day. Rows with a missing step count are dropped before
# grouping, so days without any recorded steps do not appear in the result.
Total_Steps <- activitydata %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(total_steps = sum(steps))
Total_Steps
## # A tibble: 53 × 2
## date total_steps
## <date> <int>
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
## 7 2012-10-09 12811
## 8 2012-10-10 9900
## 9 2012-10-11 10304
## 10 2012-10-12 17382
## # ... with 43 more rows
2. Plot using ggplot:
# Histogram of the daily step totals, using bins that are 1000 steps wide.
ggplot(data = Total_Steps, mapping = aes(x = total_steps)) +
  geom_histogram(binwidth = 1000, fill = "blue") +
  ggtitle("Daily Steps") +
  xlab("Total Steps") +
  ylab("Frequency")

Average daily activity pattern
1. Calculate the average number of steps for each 5-minute interval, averaged across all days:
# Mean steps for each 5-minute interval across all days, computed only from
# rows where a step count was recorded.
Interval <- activitydata %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarise(avg_steps = mean(steps))
Interval
## # A tibble: 288 × 2
## interval avg_steps
## <int> <dbl>
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
## 7 30 0.5283019
## 8 35 0.8679245
## 9 40 0.0000000
## 10 45 1.4716981
## # ... with 278 more rows
1.a Plotting Avg. Steps:
# Line plot of the mean step count against the 5-minute interval identifier.
ggplot(data = Interval, mapping = aes(x = interval, y = avg_steps)) +
  geom_line(size = 1, colour = "blue") +
  ggtitle("Avg. Daily Steps") +
  xlab("Interval") +
  ylab("Avg. Steps per day")

2. Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Keep only the row of Interval with the highest mean step count
# (the first such row if several are tied).
Interval %>%
  slice(which.max(avg_steps))
## # A tibble: 1 × 2
## interval avg_steps
## <int> <dbl>
## 1 835 206.1698
Imputing missing values
1. Calculate total number of missing values in the dataset:
# Count the rows whose step value is missing.
activitydata %>%
  filter(is.na(steps)) %>%
  nrow()
## [1] 2304
2. Impute missing values using the mean of the corresponding 5-minute interval (averaged across all days), and 3. Create a new dataset that is equal to the original dataset but with the missing data filled in:
# Work on a copy so the original data keeps its missing values.
activitydata2 <- activitydata
# Logical mask of the rows whose step count is missing.
nas <- is.na(activitydata2[["steps"]])
# Mean steps per interval over the observed values; the result is named by
# interval identifier, which is what the lookup below relies on.
avg_interval <- with(
  activitydata2,
  tapply(steps, interval, mean, na.rm = TRUE, simplify = TRUE)
)
# Fill each missing value with the mean of its own interval, looked up by name.
activitydata2[["steps"]][nas] <-
  avg_interval[as.character(activitydata2[["interval"]][nas])]
names(activitydata2)
## [1] "steps" "date" "interval"
4. Check that no missing values remain:
# Total number of missing cells over all columns of the imputed data.
sum(vapply(activitydata2, function(column) sum(is.na(column)), integer(1)))
## [1] 0
5. Reorder columns (for better understanding of the data):
# Put the identifying columns (date, interval) ahead of the measurement.
activitydata2 <- activitydata2 %>%
  select(date, interval, steps)
head(activitydata2)
## date interval steps
## 1 2012-10-01 0 1.7169811
## 2 2012-10-01 5 0.3396226
## 3 2012-10-01 10 0.1320755
## 4 2012-10-01 15 0.1509434
## 5 2012-10-01 20 0.0754717
## 6 2012-10-01 25 2.0943396
Differences in activity patterns between weekdays and weekends
# Preview the imputed data before the weekday/weekend column is added.
head(activitydata2)
## date interval steps
## 1 2012-10-01 0 1.7169811
## 2 2012-10-01 5 0.3396226
## 3 2012-10-01 10 0.1320755
## 4 2012-10-01 15 0.1509434
## 5 2012-10-01 20 0.0754717
## 6 2012-10-01 25 2.0943396
5.1 Create a new variable called weektype that labels each day as Weekday or Weekend:
# Label every row as "Weekend" (Saturday/Sunday) or "Weekday".
# as.POSIXlt()$wday numbers the days 0-6 starting on Sunday. Unlike
# weekdays(), it does not depend on the session locale, so the classification
# also works when day names are not reported in English.
activitydata2 <- activitydata2 %>%
  mutate(
    weektype = ifelse(as.POSIXlt(date)$wday %in% c(0, 6), "Weekend", "Weekday")
  )
head(activitydata2)
## date interval steps weektype
## 1 2012-10-01 0 1.7169811 Weekday
## 2 2012-10-01 5 0.3396226 Weekday
## 3 2012-10-01 10 0.1320755 Weekday
## 4 2012-10-01 15 0.1509434 Weekday
## 5 2012-10-01 20 0.0754717 Weekday
## 6 2012-10-01 25 2.0943396 Weekday
Plotting:
# Mean steps for every combination of 5-minute interval and week type.
Interval2 <- summarise(
  group_by(activitydata2, interval, weektype),
  avg_steps2 = mean(steps, na.rm = TRUE)
)
head(Interval2)
## Source: local data frame [6 x 3]
## Groups: interval [3]
##
## interval weektype avg_steps2
## <int> <chr> <dbl>
## 1 0 Weekday 2.25115304
## 2 0 Weekend 0.21462264
## 3 5 Weekday 0.44528302
## 4 5 Weekend 0.04245283
## 5 10 Weekday 0.17316562
## 6 10 Weekend 0.01650943
# Mean steps per interval, drawn as one stacked panel per week type.
# The object is stored as `plot` and printed explicitly.
plot <- ggplot(
  data = Interval2,
  mapping = aes(x = interval, y = avg_steps2, colour = weektype)
) +
  geom_line() +
  ggtitle("Avg. Daily Steps by Weektype") +
  xlab("Interval") +
  ylab("No. of Steps") +
  facet_wrap(~weektype, nrow = 2, ncol = 1)
print(plot)

Answer: Yes, there are some differences. On weekdays, activity is greatest in the morning; overall, however, there is more activity on weekends than on weekdays.