#Initialize ##################################
Install all the potentially needed packages:
# Attach a package, installing it first when it cannot be loaded.
#
# Arguments:
#   name  Package name (a character string when `char` is TRUE).
#   char  Forwarded to require() as `character.only`.
# Returns:
#   The logical result of the final require() call: TRUE when the package
#   was attached, FALSE otherwise.
install_and_load <- function(name, char = TRUE) {
  # The default is spelled TRUE, not T: T is an ordinary variable that can be
  # reassigned, which would silently change what require() looks for.
  if (!require(name, character.only = char)) {
    install.packages(name)
  }
  # Try again after the (possible) installation; this value is what callers
  # such as sapply() collect.
  require(name, character.only = char)
}
#install_github("dgrtwo/broom")
# library(broom)
# Attach (and install when missing) every package the analysis may need.
# The vector is kept in its original order, repeats included, because the
# names of the returned logical vector follow it element by element.
sapply(
  X = c(
    "data.table", "tidyverse", "magrittr", "modelr", "arules",
    "arulesViz", "readxl", "dplyr", "ggplot2", "haven",
    "expss", "ipumsr", "devtools", "broom", "purrr",
    "GGally", "cluster", "readxl", "tidyr", "writexl",
    "xlsx", "openxlsx", "psych", "data.table", "readxl",
    "knitr", "ExPanDaR", "kableExtra", "plm", "sampleSelection",
    "nnet", "reshape2", "data.table", "dplyr", "magrittr",
    "lubridate", "scales", "norm", "RPostgreSQL", "tidyr",
    "gganimate", "gifski", "stringr", "tm", "readr",
    "textclean", "grid", "ggpubr", "gridExtra", "lubridate",
    "writexl", "xlsx", "RSQLite", "odbc", "dbplyr",
    "seasonal", "plotly", "forecast", "mFilter", "tseries",
    "plm", "AER", "stargazer", "merTools", "RColorBrewer",
    "colorRamps"
  ),
  FUN = install_and_load
)
## Warning: package 'magrittr' was built under R version 4.0.2
## Warning: package 'arules' was built under R version 4.0.2
## Warning: package 'arulesViz' was built under R version 4.0.2
## Warning: package 'expss' was built under R version 4.0.2
## Warning: package 'ipumsr' was built under R version 4.0.2
## Warning: package 'devtools' was built under R version 4.0.2
## Warning: package 'usethis' was built under R version 4.0.2
## Warning: package 'GGally' was built under R version 4.0.2
## Warning: package 'xlsx' was built under R version 4.0.2
## Warning: package 'psych' was built under R version 4.0.2
## Warning: package 'ExPanDaR' was built under R version 4.0.2
## Warning: package 'kableExtra' was built under R version 4.0.2
## Warning: package 'plm' was built under R version 4.0.2
## Warning: package 'sampleSelection' was built under R version 4.0.2
## Warning: package 'maxLik' was built under R version 4.0.2
## Warning: package 'miscTools' was built under R version 4.0.2
## Warning: package 'RPostgreSQL' was built under R version 4.0.2
## Warning: package 'gifski' was built under R version 4.0.2
## Warning: package 'tm' was built under R version 4.0.2
## Warning: package 'textclean' was built under R version 4.0.2
## Warning: package 'ggpubr' was built under R version 4.0.2
## Warning: package 'odbc' was built under R version 4.0.2
## Warning: package 'seasonal' was built under R version 4.0.2
## Warning: package 'plotly' was built under R version 4.0.2
## Warning: package 'forecast' was built under R version 4.0.2
## Warning: package 'mFilter' was built under R version 4.0.2
## Warning: package 'tseries' was built under R version 4.0.2
## Warning: package 'AER' was built under R version 4.0.2
## Warning: package 'car' was built under R version 4.0.2
## Warning: package 'lmtest' was built under R version 4.0.2
## Warning: package 'sandwich' was built under R version 4.0.2
## Warning: package 'merTools' was built under R version 4.0.2
## Warning: package 'arm' was built under R version 4.0.2
## Warning: package 'lme4' was built under R version 4.0.2
## data.table tidyverse magrittr modelr arules
## TRUE TRUE TRUE TRUE TRUE
## arulesViz readxl dplyr ggplot2 haven
## TRUE TRUE TRUE TRUE TRUE
## expss ipumsr devtools broom purrr
## TRUE TRUE TRUE TRUE TRUE
## GGally cluster readxl tidyr writexl
## TRUE TRUE TRUE TRUE TRUE
## xlsx openxlsx psych data.table readxl
## TRUE TRUE TRUE TRUE TRUE
## knitr ExPanDaR kableExtra plm sampleSelection
## TRUE TRUE TRUE TRUE TRUE
## nnet reshape2 data.table dplyr magrittr
## TRUE TRUE TRUE TRUE TRUE
## lubridate scales norm RPostgreSQL tidyr
## TRUE TRUE TRUE TRUE TRUE
## gganimate gifski stringr tm readr
## TRUE TRUE TRUE TRUE TRUE
## textclean grid ggpubr gridExtra lubridate
## TRUE TRUE TRUE TRUE TRUE
## writexl xlsx RSQLite odbc dbplyr
## TRUE TRUE TRUE TRUE TRUE
## seasonal plotly forecast mFilter tseries
## TRUE TRUE TRUE TRUE TRUE
## plm AER stargazer merTools RColorBrewer
## TRUE TRUE TRUE TRUE TRUE
## colorRamps
## TRUE
# Drop the loader helper from the workspace now that the package-loading call has run.
rm(install_and_load)
Load the data. You can do this, for example, as follows:
# Folder that holds the course archive. The trailing separator is part of the
# string so a file name can be appended to it directly.
my_path_to_file <- "C:\\Users\\Alex\\Documents\\COURSERA STUDIES\\Reproducible Research JHI\\Week_2 PS1\\"
# List the folder to confirm the archive is present and to check its exact name.
list.files(my_path_to_file)
## [1] "My_Assignment_R.R" "repdata_data_activity"
## [3] "repdata_data_activity.zip" "RWO"
## [5] "Variable_Description README.txt"
# Read activity.csv straight out of the zip archive, without unpacking it.
data_on_activities <- read.csv(
  file = unz(
    description = paste0(my_path_to_file, "repdata_data_activity.zip"),
    filename = "activity.csv"
  ),
  header = TRUE,
  sep = ",",
  quote = "\"",
  na.strings = "NA"
)
# Inspect the structure of the freshly read data to confirm the import worked.
str(data_on_activities) # three columns: steps, date, interval
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
typeof(data_on_activities)
## [1] "list"
# Convert to a data.table; it keeps the data.frame class as well.
data_on_activities <- data.table::data.table(data_on_activities)
class(data_on_activities) # both classes are present, which suits the later steps
## [1] "data.table" "data.frame"
## Date-time transformation: `date` is read as character ("2012-10-01", ...),
## so a proper Date column, date1, is derived from it.
# EPL2011_12$Date2 <- as.Date( as.character(EPL2011_12$Date), "%d-%m-%y")
# library(lubridate)
data_on_activities$date1 <- as.Date(
  as.character(data_on_activities$date),
  format = "%Y-%m-%d"
)
# Keep only the rows where a step count was recorded.
data_suggested_for_analysis_1 <- data_on_activities[!is.na(steps)]
# Sum the steps of every day; this is the daily-total table used below.
total_steps_by_date <- aggregate(
  steps ~ date1,
  data = data_suggested_for_analysis_1,
  FUN = sum
)
# View(total_steps_by_date) #We have a small dataset - so it is OK to view it
knitr::kable(head(total_steps_by_date, n = 10))
date1 | steps |
---|---|
2012-10-02 | 126 |
2012-10-03 | 11352 |
2012-10-04 | 12116 |
2012-10-05 | 13294 |
2012-10-06 | 15420 |
2012-10-07 | 11015 |
2012-10-09 | 12811 |
2012-10-10 | 9900 |
2012-10-11 | 10304 |
2012-10-12 | 17382 |
# Histogram of the daily step totals; the returned histogram object is kept
# so its breaks and counts can be printed below.
hist1 <- hist(
  x = total_steps_by_date$steps,
  col = "red",
  main = "Total Steps Done Per Day",
  xlab = "Total Steps Taken On A Daily Basis",
  ylab = "Number of Days We Saw This Pattern",
  ylim = c(0, 45) # y axis runs from 0 to 45
)
hist1
## $breaks
## [1] 0 5000 10000 15000 20000 25000
##
## $counts
## [1] 5 12 28 6 2
##
## $density
## [1] 1.886792e-05 4.528302e-05 1.056604e-04 2.264151e-05 7.547170e-06
##
## $mids
## [1] 2500 7500 12500 17500 22500
##
## $xname
## [1] "total_steps_by_date$steps"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Write the same histogram to figure_1.png in the figure/ directory.
png(
  filename = "C:\\Users\\Alex\\Documents\\R\\HOMEPROJECTS\\Reproducible_Research_1\\figure\\directory\\figure_1.png",
  width = 480,
  height = 480
)
hist(
  x = total_steps_by_date$steps,
  col = "red",
  main = "Total Steps Done Per Day",
  xlab = "Total Steps Taken On A Daily Basis",
  ylab = "Number of Days We Saw This Pattern",
  ylim = c(0, 45) # y axis runs from 0 to 45
)
# Close the PNG device so the file is written out.
dev.off()
## png
## 2
### Method 1:
# summary() on the daily-total table prints, for each column, the minimum,
# quartiles, median, mean and maximum.
summary(total_steps_by_date) # Done: Mean and median will be in this output
## date1 steps
## Min. :2012-10-02 Min. : 41
## 1st Qu.:2012-10-16 1st Qu.: 8841
## Median :2012-10-29 Median :10765
## Mean :2012-10-30 Mean :10766
## 3rd Qu.:2012-11-16 3rd Qu.:13294
## Max. :2012-11-29 Max. :21194
### Method 2:
# Attach Hmisc for its describe() function.
# NOTE(review): Hmisc is not among the packages installed by the loading step
# at the top of the file, and require() only warns (returns FALSE) when a
# package is missing, so a missing Hmisc surfaces later, at the describe() call.
require(Hmisc)
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.0.2
## Loading required package: lattice
## Loading required package: Formula
## Registered S3 methods overwritten by 'Hmisc':
## method from
## [.labelled expss
## print.labelled expss
## as.data.frame.labelled expss
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
Hmisc::describe(total_steps_by_date) #More detailed statistics
## total_steps_by_date
##
## 2 Variables 53 Observations
## --------------------------------------------------------------------------------
## date1
## n missing distinct Info Mean Gmd .05
## 53 0 53 1 2012-10-30 20.53 2012-10-04
## .10 .25 .50 .75 .90 .95
## 2012-10-07 2012-10-16 2012-10-29 2012-11-16 2012-11-23 2012-11-26
##
## lowest : 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## highest: 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## --------------------------------------------------------------------------------
## steps
## n missing distinct Info Mean Gmd .05 .10
## 53 0 53 1 10766 4684 2928 5103
## .25 .50 .75 .90 .95
## 8841 10765 13294 15108 16205
##
## lowest : 41 126 2492 3219 4472, highest: 15414 15420 17382 20427 21194
## --------------------------------------------------------------------------------
### Method 3:
# Compute the two statistics directly, rounded to two decimals, and keep them
# for the comparison with the imputed data later on.
mn1 <- round(mean(total_steps_by_date[["steps"]]), 2)
md1 <- round(median(total_steps_by_date[["steps"]]), 2)
mn1
## [1] 10766.19
md1
## [1] 10765
The MEAN steps per day is: 10766.19
The MEDIAN steps per day is: 10765
# Mean number of steps in each 5-minute interval, over all rows with data.
average_steps_by_interval <- aggregate(
  steps ~ interval,
  data = data_suggested_for_analysis_1,
  FUN = mean
)
# Line chart of the interval means.
myplot1 <- ggplot(average_steps_by_interval, aes(x = interval, y = steps)) +
  geom_line(color = "green") +
  xlab("Interval") +
  ylab("Steps Taken") +
  ggtitle("Mean Steps Taken By Interval")
myplot1
png("C:\\Users\\Alex\\Documents\\R\\HOMEPROJECTS\\Reproducible_Research_1\\figure\\directory\\figure_2.png", width=480, height=480)
# A ggplot object is only drawn when it is printed. Auto-printing happens at
# the console but not inside source() or a function, so print explicitly;
# otherwise the PNG can come out empty.
print(myplot1)
dev.off()
## png
## 2
# Median number of steps in each 5-minute interval (despite the "average"
# in the variable name, this table holds medians).
average_steps_by_interval_1 <- aggregate(
  steps ~ interval,
  data = data_suggested_for_analysis_1,
  FUN = median
)
# Line chart of the interval medians.
myplot2 <- ggplot(average_steps_by_interval_1, aes(x = interval, y = steps)) +
  geom_line(color = "green") +
  xlab("Interval") +
  ylab("Steps Taken") +
  ggtitle("Median Steps Taken By Interval")
myplot2
png("C:\\Users\\Alex\\Documents\\R\\HOMEPROJECTS\\Reproducible_Research_1\\figure\\directory\\figure_3.png", width=480, height=480)
# A ggplot object is only drawn when it is printed. Auto-printing happens at
# the console but not inside source() or a function, so print explicitly;
# otherwise the PNG can come out empty.
print(myplot2)
dev.off()
## png
## 2
### Method 1:
# Keep the row(s) of the interval-mean table whose mean equals the maximum.
five_min_interval_Avg_all_days_maximum_N_steps <- average_steps_by_interval[
  average_steps_by_interval$steps == max(average_steps_by_interval$steps),
]
# View() opens a GUI data viewer, so it is only called in an interactive
# session; batch runs (Rscript, knitting) skip it instead of failing.
if (interactive()) {
  View(five_min_interval_Avg_all_days_maximum_N_steps) #Second column
}
summary(five_min_interval_Avg_all_days_maximum_N_steps) #Or all numbers here with the name Interval
## interval steps
## Min. :835 Min. :206.2
## 1st Qu.:835 1st Qu.:206.2
## Median :835 Median :206.2
## Mean :835 Mean :206.2
## 3rd Qu.:835 3rd Qu.:206.2
## Max. :835 Max. :206.2
### Method 2:
# Manual lookup in three steps: read the maximum mean from summary(), find
# the row whose mean exceeds a threshold just below it, then read that row's
# interval. The threshold 206.169 and the row number 104 are copied by hand
# from the printed output, so they are only valid for this data set.
summary(average_steps_by_interval) #We see Max. :206.170
## interval steps
## Min. : 0.0 Min. : 0.000
## 1st Qu.: 588.8 1st Qu.: 2.486
## Median :1177.5 Median : 34.113
## Mean :1177.5 Mean : 37.383
## 3rd Qu.:1766.2 3rd Qu.: 52.835
## Max. :2355.0 Max. :206.170
which(average_steps_by_interval$steps > 206.169) # See the result 104
## [1] 104
average_steps_by_interval$interval[104] #We see the answer
## [1] 835
### Method 3:
# Position of the row holding the maximum interval mean.
which(average_steps_by_interval$steps == max(average_steps_by_interval$steps)) #Returns 104
## [1] 104
# Look the interval up through the computed position rather than a row number
# typed in by hand, so the answer stays correct if the data change.
average_steps_by_interval$interval[which.max(average_steps_by_interval$steps)] #We see the answer
## [1] 835
### Method 4:
# Cross-tabulate "is this the maximum mean?" against the interval value.
v1 <- table(
  average_steps_by_interval$steps == max(average_steps_by_interval$steps),
  average_steps_by_interval$interval
)
# Turn the contingency table into a long data.table (columns V1, V2, N) so it
# can be filtered by row.
v1 <- data.table(v1)
# Keep the cell where the flag is TRUE and at least one observation fell in it:
# V1 is the TRUE/FALSE flag, V2 the interval, N the number of observations.
v2 <- v1[v1$N > 0 & v1$V1 == TRUE, ]
str(v2) #We see the Answer: 835
## Classes 'data.table' and 'data.frame': 1 obs. of 3 variables:
## $ V1: chr "TRUE"
## $ V2: chr "835"
## $ N : int 1
## - attr(*, ".internal.selfref")=<externalptr>
# v1 is no longer needed; v2 is a single row and is left in the workspace.
rm(v1)
### Method I
summary(data_on_activities) #The easiest one and the most obvious
## steps date interval date1
## Min. : 0.00 Length:17568 Min. : 0.0 Min. :2012-10-01
## 1st Qu.: 0.00 Class :character 1st Qu.: 588.8 1st Qu.:2012-10-16
## Median : 0.00 Mode :character Median :1177.5 Median :2012-10-31
## Mean : 37.38 Mean :1177.5 Mean :2012-10-31
## 3rd Qu.: 12.00 3rd Qu.:1766.2 3rd Qu.:2012-11-15
## Max. :806.00 Max. :2355.0 Max. :2012-11-30
## NA's :2304
# We see the number of NAs in each column
### Method II
# Rows whose step count is missing.
my_NA_data <- data_on_activities[is.na(data_on_activities$steps),]
# The number of those rows is the number of missing values
# (2304, the NA's count that summary() reports for steps).
nrow(my_NA_data)
# Collapse to a one-row, one-column count table (the total sits in column n).
my_NA_data <- dplyr::count(my_NA_data)
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(my_NA_data)
}
length(my_NA_data) #Number of columns of the count table (not the NA count)
## [1] 1
dim(my_NA_data) #Dimensions of the count table: one row, one column
## [1] 1 1
### Method III
purrr::map(data_on_activities, ~sum(is.na(.)))
## $steps
## [1] 2304
##
## $date
## [1] 0
##
## $interval
## [1] 0
##
## $date1
## [1] 0
### Method IV
# Count the missing values of every column. The function is passed as a
# lambda because dplyr::funs() has been deprecated since dplyr 0.8.0.
data_on_activities %>%
  dplyr::summarise_all(~ sum(is.na(.)))
## steps date interval date1
## 1 2304 0 0 0
### Method V
sapply(X = data_on_activities, FUN = function(x) sum(is.na(x)))
## steps date interval date1
## 2304 0 0 0
### Method VI
apply(data_on_activities, 2, function(x) length(which(is.na(x)))) #Longer in computation than the previous one
## steps date interval date1
## 2304 0 0 0
### Method VII
apply(is.na(data_on_activities), 2, sum)
## steps date interval date1
## 2304 0 0 0
### Method VIII
colSums(is.na(data_on_activities))
## steps date interval date1
## 2304 0 0 0
### Method I
summary(data_on_activities) #The easiest one and the most obvious
## steps date interval date1
## Min. : 0.00 Length:17568 Min. : 0.0 Min. :2012-10-01
## 1st Qu.: 0.00 Class :character 1st Qu.: 588.8 1st Qu.:2012-10-16
## Median : 0.00 Mode :character Median :1177.5 Median :2012-10-31
## Mean : 37.38 Mean :1177.5 Mean :2012-10-31
## 3rd Qu.: 12.00 3rd Qu.:1766.2 3rd Qu.:2012-11-15
## Max. :806.00 Max. :2355.0 Max. :2012-11-30
## NA's :2304
# We see the number of NAs in each column
### Method II
# Rows whose step count is missing.
my_NA_data <- data_on_activities[is.na(data_on_activities$steps),]
# The number of those rows is the number of missing values
# (2304, the NA's count that summary() reports for steps).
nrow(my_NA_data)
# Collapse to a one-row, one-column count table (the total sits in column n).
my_NA_data <- dplyr::count(my_NA_data)
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(my_NA_data)
}
length(my_NA_data) #Number of columns of the count table (not the NA count)
## [1] 1
dim(my_NA_data) #Dimensions of the count table: one row, one column
## [1] 1 1
### Method III
purrr::map(data_on_activities, ~sum(is.na(.)))
## $steps
## [1] 2304
##
## $date
## [1] 0
##
## $interval
## [1] 0
##
## $date1
## [1] 0
### Method IV
# Count the missing values of every column. The function is passed as a
# lambda because dplyr::funs() has been deprecated since dplyr 0.8.0.
data_on_activities %>%
  dplyr::summarise_all(~ sum(is.na(.)))
## steps date interval date1
## 1 2304 0 0 0
### Method V
sapply(X = data_on_activities, FUN = function(x) sum(is.na(x)))
## steps date interval date1
## 2304 0 0 0
### Method VI
apply(data_on_activities, 2, function(x) length(which(is.na(x)))) #Longer in computation than the previous one
## steps date interval date1
## 2304 0 0 0
### Method VII
apply(is.na(data_on_activities), 2, sum)
## steps date interval date1
## 2304 0 0 0
### Method VIII
colSums(is.na(data_on_activities))
## steps date interval date1
## 2304 0 0 0
# Work on a tibble copy of the data. tibble::as_tibble() replaces
# dplyr::tbl_df(), which is deprecated as of dplyr 1.0.0.
remade_to_df_data_on_activities <- tibble::as_tibble(data_on_activities)
# average_steps_by_interval <- aggregate(data=data_suggested_for_analysis_1,steps~interval,mean)
# Attach the interval mean to every row. Both tables have a `steps` column, so
# the join names them steps.x (observed value) and steps.y (interval mean).
remade_to_df_data_on_activities <- dplyr::inner_join(
  remade_to_df_data_on_activities,
  average_steps_by_interval,
  by = "interval"
)
# names(remade_to_df_data_on_activities) <- c("steps","date","interval","avg", "date1")
# Imputation: where the observed count is missing, use the interval mean.
remade_to_df_data_on_activities$steps <- ifelse(
  is.na(remade_to_df_data_on_activities$steps.x),
  remade_to_df_data_on_activities$steps.y,
  remade_to_df_data_on_activities$steps.x
)
# Daily totals on the imputed data.
imputed_aggregated_data_steps_per_day <- aggregate(
  steps ~ date,
  data = remade_to_df_data_on_activities,
  FUN = sum
)
# Histogram of the daily totals after imputation; the histogram object is
# kept so its breaks and counts can be printed below.
hist2 <- hist(
  x = imputed_aggregated_data_steps_per_day$steps,
  col = "darkgreen",
  main = "Total Steps Per Day On Imputed Data",
  xlab = "Total Steps Taken Per Day",
  ylab = "Number Of Days With This Pattern",
  ylim = c(0, 45)
)
hist2
## $breaks
## [1] 0 5000 10000 15000 20000 25000
##
## $counts
## [1] 5 12 36 6 2
##
## $density
## [1] 1.639344e-05 3.934426e-05 1.180328e-04 1.967213e-05 6.557377e-06
##
## $mids
## [1] 2500 7500 12500 17500 22500
##
## $xname
## [1] "imputed_aggregated_data_steps_per_day$steps"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Write the imputed-data histogram to figure_4.png.
png(
  filename = "C:\\Users\\Alex\\Documents\\R\\HOMEPROJECTS\\Reproducible_Research_1\\figure\\directory\\figure_4.png",
  width = 480,
  height = 480
)
hist(
  x = imputed_aggregated_data_steps_per_day$steps,
  col = "darkgreen",
  main = "Total Steps Per Day On Imputed Data",
  xlab = "Total Steps Taken Per Day",
  ylab = "Number Of Days With This Pattern",
  ylim = c(0, 45)
)
# Close the PNG device so the file is written out.
dev.off()
## png
## 2
summary(imputed_aggregated_data_steps_per_day$steps) #There is mean and median
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41 9819 10766 10766 12811 21194
# Another way: compute the two statistics directly, rounded to two decimals.
mn_2 <- round(mean(imputed_aggregated_data_steps_per_day[["steps"]]), 2)
md_2 <- round(median(imputed_aggregated_data_steps_per_day[["steps"]]), 2)
mn_2
## [1] 10766.19
md_2
## [1] 10766.19
### Differences from the first part of the assignment:
# mn1 and md1 are the mean and median computed before imputation.
mn1
## [1] 10766.19
md1
## [1] 10765
# Original minus imputed, for each statistic.
diff_mean <- mn1 - mn_2
diff_median <- md1 - md_2
diff_mean # the mean is unchanged
## [1] 0
diff_median # the median differs slightly (-1.19)
## [1] -1.19
# Other ways for doing same are made above (for the non-imputed data)
The difference in the MEAN steps per day with imputed data is: 0. The difference in the MEDIAN steps per day with imputed data is: -1.19.
# weekdays() returns day names in the language of the current locale, so an
# English locale is requested before the names are derived.
# `defaults` is a macOS command; run it only there. On other systems it does
# not exist and the call just fails (exit status 127).
if (identical(Sys.info()[["sysname"]], "Darwin")) {
  system("defaults write org.R-project.R force.LANG en_US.UTF-8")
}
# Locale names are platform-specific: "English" is the Windows spelling,
# "en_US.UTF-8" the usual one elsewhere.
Sys.setlocale(
  "LC_ALL",
  if (.Platform$OS.type == "windows") "English" else "en_US.UTF-8"
) #to be sure the names in the DS will be in English
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
# Add the day-of-week name of every observation (as a factor) for inspection.
remade_to_df_data_on_activities_3 <- data.frame(
  remade_to_df_data_on_activities,
  dow = weekdays(as.Date(remade_to_df_data_on_activities$date))
)
remade_to_df_data_on_activities_3$dow <- as.factor(remade_to_df_data_on_activities_3$dow)
summary(remade_to_df_data_on_activities_3$dow)
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 2592 2592 2304 2304 2592 2592 2592
# Weekend flag. It is derived from the numeric day of the week
# (as.POSIXlt()$wday: 0 = Sunday, 6 = Saturday) rather than from the day
# names, so it stays correct when the locale is not English. Matching on
# "Sunday"/"Saturday" would silently label every day "weekday" in that case.
remade_to_df_data_on_activities_3$dow_w_nd <- ifelse(
  as.POSIXlt(as.Date(remade_to_df_data_on_activities_3$date))$wday %in% c(0L, 6L),
  "weekend",
  "weekday"
)
### Other method:
# Same classification through dplyr::mutate(), stored as a factor in dow_cat.
remade_to_df_data_on_activities_3 <- mutate(
  remade_to_df_data_on_activities_3,
  dow_cat = ifelse(
    as.POSIXlt(as.Date(date))$wday %in% c(0L, 6L),
    "weekend",
    "weekday"
  )
)
remade_to_df_data_on_activities_3$dow_cat <- as.factor(remade_to_df_data_on_activities_3$dow_cat)
remade_to_df_data_on_activities_3$interval <- as.factor(remade_to_df_data_on_activities_3$interval)
# Mean steps for every (weekday/weekend, interval) combination.
weekend_vs_weekdays_steps_intevals <- aggregate(
  steps ~ dow_cat + interval,
  data = remade_to_df_data_on_activities_3,
  FUN = mean
)
# Turn the interval labels back into numbers (via character, so the factor
# labels are used and not the internal level codes). A numeric x gives a
# continuous axis in interval order, instead of depending on how lattice
# treats a character column.
weekend_vs_weekdays_steps_intevals$interval <- as.numeric(
  as.character(weekend_vs_weekdays_steps_intevals$interval)
)
# Two stacked panels (weekday / weekend) of mean steps against interval.
final_plot <- lattice::xyplot(
  steps ~ interval | factor(dow_cat),
  data = weekend_vs_weekdays_steps_intevals,
  type = "l",
  layout = c(1, 2),
  xlab = "Interval",
  ylab = "Number of Steps",
  # OX = from -100 to 2500: xlim takes the two end points of a numeric axis,
  # and the tick marks (one every 500) are set separately through `scales`.
  xlim = c(-100, 2500),
  scales = list(x = list(at = seq(0, 2500, 500)))
) #The plot is the copy of the plot requested in the task
final_plot
png("C:\\Users\\Alex\\Documents\\R\\HOMEPROJECTS\\Reproducible_Research_1\\figure\\directory\\figure_5.png", width=480, height=480)
# A lattice object is only drawn when it is printed. Auto-printing happens at
# the console but not inside source() or a function, so print explicitly;
# otherwise the PNG can come out empty.
print(final_plot)
dev.off()
## png
## 2
Note that the echo = TRUE
parameter was added to the code chunk to allow printing of the R code that generated the plot.