The purpose of this project is to show the ability to collect, work with, and clean a data set. The goal is to prepare tidy data that can be used for later analysis.
The data used in this project were collected from the accelerometers of Samsung Galaxy S smartphones. For a full description, please visit the following website:
http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones [1]
Here are the data for the project:
https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
The script does the following:
# Load libraries
library(data.table)
library(dplyr)
# Read Dataset
# Every path is passed through `file =` instead of fread()'s first (`input`)
# argument. The dataset directory name contains spaces, and fread() interprets
# a space-containing `input` that is not an existing file as a shell command.
# With `file =`, a missing dataset fails with a plain file-not-found error
# rather than an attempted command execution.
# Test partition: subject ids, activity codes, and feature measurements
subject_test <- fread(file = "./UCI HAR Dataset/test/subject_test.txt")
y_test <- fread(file = "./UCI HAR Dataset/test/y_test.txt")
x_test <- fread(file = "./UCI HAR Dataset/test/X_test.txt")
# Training partition: same three files as the test partition
subject_train <- fread(file = "./UCI HAR Dataset/train/subject_train.txt")
y_train <- fread(file = "./UCI HAR Dataset/train/y_train.txt")
x_train <- fread(file = "./UCI HAR Dataset/train/X_train.txt")
# Extract features and activity names
# cnames: column V2 is used below as the measurement column names
# lActivities: column V1 (activity code) is mapped below to V2 (activity label)
cnames <- fread(file = "./UCI HAR Dataset/features.txt")
lActivities <- fread(file = "./UCI HAR Dataset/activity_labels.txt")
# Give every column a descriptive name, then assemble each partition
# Subject identifier columns
names(subject_test) <- "subject"
names(subject_train) <- "subject"
# Activity code columns
names(y_test) <- "activity"
names(y_train) <- "activity"
# Measurement columns take their names from the second column of cnames
names(x_test) <- as.character(cnames[["V2"]])
names(x_train) <- as.character(cnames[["V2"]])
# Column-bind subject, activity, and measurements for each partition
all_test <- cbind(subject_test, y_test, x_test)
all_train <- cbind(subject_train, y_train, x_train)
# Stack the test rows and the training rows into a single data set
all_data <- rbind(all_test, all_train)
paste0("Number of variables: ", ncol(all_data))
## [1] "Number of variables: 563"
paste0("Number of Observations: ", nrow(all_data))
## [1] "Number of Observations: 10299"
# make.names() rewrites characters that are not legal in R names and, with
# unique = TRUE, disambiguates any repeated names
valid_names <- make.names(names(all_data), unique = TRUE, allow_ = TRUE)
names(all_data) <- valid_names
# Keep the subject and activity columns plus the columns whose names contain
# ".mean." or ".std."
sel_data <- all_data %>%
  select(matches("subject|activity|\\.mean\\.|\\.std\\."))
# Drop every pair of consecutive dots from the selected column names
names(sel_data) <- gsub("\\.\\.", "", names(sel_data))
# Replace each activity code with its label: look the code up in
# lActivities$V1 and take the matching entry of lActivities$V2
sel_data$activity <- lActivities[["V2"]][
  match(sel_data[["activity"]], lActivities[["V1"]])
]
paste0("The recorded activities are: ", toString(unique(sel_data$activity)))
## [1] "The recorded activities are: STANDING, SITTING, LAYING, WALKING, WALKING_DOWNSTAIRS, WALKING_UPSTAIRS"
# Save the selected data set without row names
write.table(sel_data, "sel_data.txt", row.names = FALSE)
# From the previous data set, create a second independent tidy data set with
# the average of each variable for each activity and each subject.
# mean is handed to summarise_all() directly: the funs() wrapper previously
# used here is deprecated in dplyr, and with a single function the output
# column names are the same either way.
# ungroup() drops the grouping that summarise() leaves on the result, so
# mean_data is a plain table.
mean_data <- sel_data %>%
  group_by(subject, activity) %>%
  summarise_all(mean) %>%
  ungroup()
# Save the averaged data set without row names
write.table(mean_data, file = "mean_data.txt", row.names = FALSE)
# Clean up the objects created by this script.
# Only the script's own objects are removed; rm(list = ls()) would also
# destroy any unrelated objects the user had in the workspace before running
# the script. intersect() with ls() keeps rm() from warning about a name that
# does not exist (for example, if the script stopped early).
rm(list = intersect(
  ls(),
  c(
    "subject_test", "y_test", "x_test",
    "subject_train", "y_train", "x_train",
    "cnames", "lActivities",
    "all_test", "all_train", "all_data",
    "valid_names", "sel_data", "mean_data"
  )
))
[1]: Davide Anguita, Alessandro Ghio, Luca Oneto, Xavier Parra and Jorge L. Reyes-Ortiz. Human Activity Recognition on Smartphones using a Multiclass Hardware-Friendly Support Vector Machine. International Workshop of Ambient Assisted Living (IWAAL 2012). Vitoria-Gasteiz, Spain. Dec 2012