# Optional developer scratch directory used to test downloading and unzipping.
# Only switch to it when it exists, so the script does not abort on machines
# that lack this path; otherwise keep the current working directory.
scratch_dir <- "/Users/tina/desktop/Temp"
if (dir.exists(scratch_dir)) {
  setwd(scratch_dir)
} else {
  message("Directory '", scratch_dir, "' not found; using ", getwd())
}
# Attach the packages used by the script.
# Earlier package list, kept for reference:
# packages <- c("RCurl", "downloader", "plyr", "dplyr", "data.table", "rapport", "tidyr")
packages <- c("RCurl", "downloader", "data.table", "rapport", "tidyr")
# require() returns TRUE/FALSE per package; vapply() guarantees a named
# logical vector whatever the input looks like.
loaded <- vapply(packages, require, logical(1),
                 character.only = TRUE, quietly = TRUE)
# With quietly = TRUE a missing package is reported only through the FALSE
# return value, so surface it explicitly instead of failing later.
if (!all(loaded)) {
  warning("Packages that could not be loaded: ",
          paste(names(loaded)[!loaded], collapse = ", "),
          call. = FALSE)
}
# Show the load status (sample output from a run with everything installed).
loaded
## RCurl downloader data.table rapport tidyr
## TRUE TRUE TRUE TRUE TRUE
# Fetch and unpack the data set from the internet, but only the parts that
# are not already present in the working directory.
dataFile <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
zip_path <- "./UCI-HAR-Dataset.zip"
# features.txt is read later in the script; its presence is used as the sign
# that the archive has already been extracted.
if (!file.exists("./UCI HAR Dataset/features.txt")) {
  if (!file.exists(zip_path)) {
    # Prefer R's built-in libcurl support; fall back to the external curl
    # binary only when libcurl is not available in this build of R.
    download_method <- if (isTRUE(unname(capabilities("libcurl")))) {
      "libcurl"
    } else {
      "curl"
    }
    # mode = "wb" keeps the zip archive intact on platforms that would
    # otherwise translate line endings.
    status <- tryCatch(
      download.file(dataFile, zip_path, method = download_method, mode = "wb"),
      error = function(e) {
        # Do not leave a truncated archive behind for the next run to reuse.
        unlink(zip_path)
        stop(e)
      }
    )
    if (status != 0) {
      unlink(zip_path)
      stop("Download of the data set failed with status ", status,
           call. = FALSE)
    }
  }
  # unzip() extracts into the working directory, so no separate dir.create()
  # call is made here.
  unzip(zip_path)
}
# Work out the column classes from a small sample to speed up read.table().
initial_test <- read.table("./UCI HAR Dataset/test/X_test.txt", nrows = 100)
# One class name per column; vapply() enforces exactly one string each.
test_classes <- vapply(initial_test, class, character(1))
# Read the full test and train measurement tables using the sampled classes.
XtestData <- read.table("./UCI HAR Dataset/test/X_test.txt",
                        quote = "", fill = TRUE, colClasses = test_classes)
XtrainData <- read.table("./UCI HAR Dataset/train/X_train.txt",
                         quote = "", fill = TRUE, colClasses = test_classes)
# Stack test rows on top of train rows; the same order is used for the
# subject and activity files so the rows stay aligned.
CombinedTestTrain <- rbind(XtestData, XtrainData)
# CombinedTestTrain is the output required for part 1 of the project:
# "Merges the training and the test sets to create one data set."
# Find the feature columns whose label mentions a mean or std measurement.
features <- read.table("./UCI HAR Dataset/features.txt")
# The second column holds the feature labels; compare them as plain strings.
feature_labels <- as.character(features[, 2])
mean_std_pattern <- "(.*)[Mm]ean(.*)|(.*)[Ss]td(.*)"
# Positions of the matching features (used to subset the measurement table).
MeanAndStd <- grep(mean_std_pattern, feature_labels, perl = TRUE, value = FALSE)
# The matching labels themselves, later used as column names for the data set.
MeanAndStdNames <- feature_labels[MeanAndStd]
# Display the syntactically valid versions of those labels.
make.names(MeanAndStdNames)
## [1] "tBodyAcc.mean...X"
## [2] "tBodyAcc.mean...Y"
## [3] "tBodyAcc.mean...Z"
## [4] "tBodyAcc.std...X"
## [5] "tBodyAcc.std...Y"
## [6] "tBodyAcc.std...Z"
## [7] "tGravityAcc.mean...X"
## [8] "tGravityAcc.mean...Y"
## [9] "tGravityAcc.mean...Z"
## [10] "tGravityAcc.std...X"
## [11] "tGravityAcc.std...Y"
## [12] "tGravityAcc.std...Z"
## [13] "tBodyAccJerk.mean...X"
## [14] "tBodyAccJerk.mean...Y"
## [15] "tBodyAccJerk.mean...Z"
## [16] "tBodyAccJerk.std...X"
## [17] "tBodyAccJerk.std...Y"
## [18] "tBodyAccJerk.std...Z"
## [19] "tBodyGyro.mean...X"
## [20] "tBodyGyro.mean...Y"
## [21] "tBodyGyro.mean...Z"
## [22] "tBodyGyro.std...X"
## [23] "tBodyGyro.std...Y"
## [24] "tBodyGyro.std...Z"
## [25] "tBodyGyroJerk.mean...X"
## [26] "tBodyGyroJerk.mean...Y"
## [27] "tBodyGyroJerk.mean...Z"
## [28] "tBodyGyroJerk.std...X"
## [29] "tBodyGyroJerk.std...Y"
## [30] "tBodyGyroJerk.std...Z"
## [31] "tBodyAccMag.mean.."
## [32] "tBodyAccMag.std.."
## [33] "tGravityAccMag.mean.."
## [34] "tGravityAccMag.std.."
## [35] "tBodyAccJerkMag.mean.."
## [36] "tBodyAccJerkMag.std.."
## [37] "tBodyGyroMag.mean.."
## [38] "tBodyGyroMag.std.."
## [39] "tBodyGyroJerkMag.mean.."
## [40] "tBodyGyroJerkMag.std.."
## [41] "fBodyAcc.mean...X"
## [42] "fBodyAcc.mean...Y"
## [43] "fBodyAcc.mean...Z"
## [44] "fBodyAcc.std...X"
## [45] "fBodyAcc.std...Y"
## [46] "fBodyAcc.std...Z"
## [47] "fBodyAcc.meanFreq...X"
## [48] "fBodyAcc.meanFreq...Y"
## [49] "fBodyAcc.meanFreq...Z"
## [50] "fBodyAccJerk.mean...X"
## [51] "fBodyAccJerk.mean...Y"
## [52] "fBodyAccJerk.mean...Z"
## [53] "fBodyAccJerk.std...X"
## [54] "fBodyAccJerk.std...Y"
## [55] "fBodyAccJerk.std...Z"
## [56] "fBodyAccJerk.meanFreq...X"
## [57] "fBodyAccJerk.meanFreq...Y"
## [58] "fBodyAccJerk.meanFreq...Z"
## [59] "fBodyGyro.mean...X"
## [60] "fBodyGyro.mean...Y"
## [61] "fBodyGyro.mean...Z"
## [62] "fBodyGyro.std...X"
## [63] "fBodyGyro.std...Y"
## [64] "fBodyGyro.std...Z"
## [65] "fBodyGyro.meanFreq...X"
## [66] "fBodyGyro.meanFreq...Y"
## [67] "fBodyGyro.meanFreq...Z"
## [68] "fBodyAccMag.mean.."
## [69] "fBodyAccMag.std.."
## [70] "fBodyAccMag.meanFreq.."
## [71] "fBodyBodyAccJerkMag.mean.."
## [72] "fBodyBodyAccJerkMag.std.."
## [73] "fBodyBodyAccJerkMag.meanFreq.."
## [74] "fBodyBodyGyroMag.mean.."
## [75] "fBodyBodyGyroMag.std.."
## [76] "fBodyBodyGyroMag.meanFreq.."
## [77] "fBodyBodyGyroJerkMag.mean.."
## [78] "fBodyBodyGyroJerkMag.std.."
## [79] "fBodyBodyGyroJerkMag.meanFreq.."
## [80] "angle.tBodyAccMean.gravity."
## [81] "angle.tBodyAccJerkMean..gravityMean."
## [82] "angle.tBodyGyroMean.gravityMean."
## [83] "angle.tBodyGyroJerkMean.gravityMean."
## [84] "angle.X.gravityMean."
## [85] "angle.Y.gravityMean."
## [86] "angle.Z.gravityMean."
# Keep only the mean/std measurement columns of the combined test + train
# data and label them in a single step. The labels are the feature names
# selected from features.txt, made syntactically valid with make.names().
DataNeeded <- setNames(
  CombinedTestTrain[, MeanAndStd],
  make.names(MeanAndStdNames)
)
# Subject identifiers: read the test and train files, then stack them
# test-first so the row order is the same in both combined tables below.
XtestID <- read.table(
  "./UCI HAR Dataset/test/subject_test.txt",
  quote = "", fill = TRUE, col.names = "subject_ID"
)
XtrainID <- read.table(
  "./UCI HAR Dataset/train/subject_train.txt",
  quote = "", fill = TRUE, col.names = "subject_ID"
)
IDs <- rbind(XtestID, XtrainID)
# Activity codes: same files layout and same test-first stacking order.
XtestActivity <- read.table(
  "./UCI HAR Dataset/test/y_test.txt",
  quote = "", fill = TRUE, col.names = "activity"
)
XtrainActivity <- read.table(
  "./UCI HAR Dataset/train/y_train.txt",
  quote = "", fill = TRUE, col.names = "activity"
)
# rbind() of two data frames already yields a data frame.
activity <- rbind(XtestActivity, XtrainActivity)
# Read the activity labels (column 1: numeric code, column 2: label text)
# and convert the labels to human-readable CamelCase.
activity_table <- read.table("./UCI HAR Dataset/activity_labels.txt",
                             header = FALSE, quote = "", fill = TRUE)
Activity_Lables <- tolower(as.character(activity_table[, 2]))
ActivityLables <- tocamel(Activity_Lables, delim = "\\_", upper = TRUE, sep = "")
# Bind the activities, subject IDs and measurements together, column-wise.
AllData <- cbind(activity, IDs, DataNeeded)
# Replace each activity code by its label. The lookup goes through the code
# column of the label file rather than by position, so it stays correct even
# if the file does not list the codes as 1, 2, 3, ... in order.
AllData[, 1] <- ActivityLables[match(AllData[, 1], activity_table[, 1])]
# Make the variable names more descriptive.
# Step 1: convert every column name to CamelCase, splitting on "." and "_".
VariableNames <- tocamel(names(AllData), delim = "\\.|\\_", upper = TRUE, sep = "")
# Step 2: expand abbreviations. Each entry maps a pattern to its replacement;
# the substitutions are applied one after another in the order listed.
expansions <- c(
  Acc = "Acceleration",
  Mag = "Magnitude",
  Gyro = "Gyroscope",
  BodyBody = "Body",
  Std = "StandardDeviation",
  Freq = "Frequency"
)
VariableNames6 <- Reduce(
  function(current, abbreviation) {
    gsub(abbreviation, expansions[[abbreviation]], current)
  },
  names(expansions),
  VariableNames
)
# Step 3: install the expanded names on the data set.
names(AllData) <- VariableNames6
# Create a tidy data set containing the average of the observations for each
# subject and activity, for each of the variables measured.
# Only the measurement columns are averaged. The two identifier columns act
# purely as grouping keys, so mean() is never applied to the character
# Activity column and no warnings have to be suppressed.
is_measurement <- !(names(AllData) %in% c("SubjectID", "Activity"))
AveragedAllData2 <- aggregate(
  AllData[, is_measurement, drop = FALSE],
  by = list(Activities = AllData[["Activity"]],
            Subject = AllData[["SubjectID"]]),
  FUN = mean
)
# Write the tidy data set to average.txt in the working directory; the header
# row holds the column names (Activities, Subject, then the measurements).
write.table(AveragedAllData2, "./average.txt", row.names = FALSE)
#Reading the tidy data set back into R using check.names = FALSE
# check.names = FALSE is to remove the "X." R adds to the column names.
# tidy <- read.table("./average.txt", quote = "", header = TRUE, fill = TRUE, check.names = FALSE)