##
## Student: Al Shain
## Email: al_shain@me.com
## Course Project Assignment: Getting and Cleaning Data
## Description:
## The purpose of this project is to demonstrate your ability to collect, work
## with, and clean a data set. The goal is to prepare tidy data that can be used
## for later analysis. You will be graded by your peers on a series of yes/no
## questions related to the project. You will be required to submit: 1) a tidy
## data set as described below, 2) a link to a Github repository with your
## script for performing the analysis, and 3) a code book that describes the
## variables, the data, and any transformations or work that you performed to
## clean up the data called CodeBook.md. You should also include a README.md in
## the repo with your scripts. This repo explains how all of the scripts work
## and how they are connected.
##
## One of the most exciting areas in all of data science right now is wearable
## computing - see for example this article . Companies like Fitbit, Nike, and
## Jawbone Up are racing to develop the most advanced algorithms to attract new
## users. The data linked to from the course website represent data collected
## from the accelerometers from the Samsung Galaxy S smartphone. A full
## description is available at the site where the data was obtained:
##
## http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones
##
## Here are the data for the project:
##
## https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip
##
## set workding directory to project
setwd("~/Desktop/Coursera/GettingAndCleansingData/Project")
## make sure the library "reshape2" is loaded
library(reshape2)
## set the download, URL, and unzip file name
downloadFile <- "data/getdata_dataset.zip"
## download and unzip the filename
downloadURL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
## set variables for URL, file and download locations
trainXFile <- "./data/UCI HAR Dataset/train/X_train.txt"
trainLabels <- "./data/UCI HAR Dataset/train/y_train.txt"
trainSubjectFile <- ".data/UCI HAR Dataset/train/subject_train.txt"
testXFile <- "./data/UCI HAR Dataset/test/X_test.txt"
testLabels <- "./data/UCI HAR Dataset/test/y_test.txt"
testSubjectFile <- ".data/UCI HAR Dataset/test/subject_test.txt"
## test for data foloder and zip file, if NOT found create
if(!file.exists("./data")) { dir.create("./data")}
if (!file.exists(downloadFile)) {
download.file(downloadURL, downloadFile, method = "curl");
unzip(downloadFile, overwrite = T, exdir = ".")
}
## Load activity labels - Uses descriptive activity names to name the activities in the data set
activityLabels <- read.table("./data/UCI HAR Dataset/activity_labels.txt")
activityLabels[,2] <- as.character(activityLabels[,2])
features <- read.table("./data/UCI HAR Dataset/features.txt")
features[,2] <- as.character(features[,2])
## Extract only the data on mean and standard deviation
featuresWanted <- grep(".*mean.*|.*std.*", features[,2])
featuresWanted.names <- features[featuresWanted,2]
featuresWanted.names = gsub('-mean', 'Mean', featuresWanted.names)
featuresWanted.names = gsub('-std', 'Std', featuresWanted.names)
featuresWanted.names <- gsub('[-()]', '', featuresWanted.names)
## Load the data sets
train <- read.table("./data/UCI HAR Dataset/train/X_train.txt")[featuresWanted]
trainActivities <- read.table("./data/UCI HAR Dataset/train/Y_train.txt")
trainSubjects <- read.table("./data/UCI HAR Dataset/train/subject_train.txt")
train <- cbind(trainSubjects, trainActivities, train)
test <- read.table("./data/UCI HAR Dataset/test/X_test.txt")[featuresWanted]
testActivities <- read.table("./data/UCI HAR Dataset/test/Y_test.txt")
testSubjects <- read.table("./data/UCI HAR Dataset/test/subject_test.txt")
test <- cbind(testSubjects, testActivities, test)
## merge train and test data sets and add thier labels
combinedData <- rbind(train, test)
colnames(combinedData) <- c("subject", "activity", featuresWanted.names)
## turn activities & subjects into factors
combinedData$activity <- factor(combinedData$activity, levels = activityLabels[,1], labels = activityLabels[,2])
combinedData$subject <- as.factor(combinedData$subject)
combinedData.melted <- melt(combinedData, id = c("subject", "activity"))
combinedData.mean <- dcast(combinedData.melted, subject + activity ~ variable, mean)
## Write out the tidy data set
write.table(combinedData.mean, "tidy.txt", row.names=FALSE, quote=FALSE)