# Load train data
trainData <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/X_train.txt")
trainLabel <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/y_train.txt")
trainSubject <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/subject_train.txt")
# Load test data
testData <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/X_test.txt")
testLabel <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/y_test.txt")
testSubject <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/subject_test.txt")
joinData <- rbind(trainData, testData)
# remove no longer required data sets
remove(trainData); remove(testData)
# Show dimensions of new data table
dim(joinData)
## [1] 10299 561
The train and test labels are also joined and finally the train and test subject identifiers.
joinLabel <- rbind(trainLabel, testLabel)
joinSubject <- rbind(trainSubject, testSubject)
remove(trainLabel); remove(testLabel); remove(trainSubject); remove(testSubject)
head(joinData,5)[,c('V1', 'V2', 'V3', 'V4', 'V5')]
## V1 V2 V3 V4 V5
## 1 0.2885845 -0.02029417 -0.1329051 -0.9952786 -0.9831106
## 2 0.2784188 -0.01641057 -0.1235202 -0.9982453 -0.9753002
## 3 0.2796531 -0.01946716 -0.1134617 -0.9953796 -0.9671870
## 4 0.2791739 -0.02620065 -0.1232826 -0.9960915 -0.9834027
## 5 0.2766288 -0.01656965 -0.1153619 -0.9981386 -0.9808173
features <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/features.txt")
head(features)
## V1 V2
## 1 1 tBodyAcc-mean()-X
## 2 2 tBodyAcc-mean()-Y
## 3 3 tBodyAcc-mean()-Z
## 4 4 tBodyAcc-std()-X
## 5 5 tBodyAcc-std()-Y
## 6 6 tBodyAcc-std()-Z
meanSD <- grep("mean\\(\\)|std\\(\\)", features[, 2])
joinData <- joinData[, meanSD]
dim(joinData)
## [1] 10299 66
names(joinData) <- gsub("\\(\\)", "", features[meanSD, 2]) # remove "()"
names(joinData) <- gsub("mean", "Mean", names(joinData)) # capitalise M
names(joinData) <- gsub("std", "Std", names(joinData)) # change "std" to "SD"
names(joinData) <- gsub("-", "", names(joinData)) # remove "-"
remove(features)
# preview data
head(joinData,5)[,c('tBodyAccMeanX', 'tBodyAccMeanY', 'tBodyAccMeanZ', 'tBodyAccStdX',
'tBodyAccStdY')]
## tBodyAccMeanX tBodyAccMeanY tBodyAccMeanZ tBodyAccStdX tBodyAccStdY
## 1 0.2885845 -0.02029417 -0.1329051 -0.9952786 -0.9831106
## 2 0.2784188 -0.01641057 -0.1235202 -0.9982453 -0.9753002
## 3 0.2796531 -0.01946716 -0.1134617 -0.9953796 -0.9671870
## 4 0.2791739 -0.02620065 -0.1232826 -0.9960915 -0.9834027
## 5 0.2766288 -0.01656965 -0.1153619 -0.9981386 -0.9808173
activity <- read.table("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/activity_labels.txt")
activity
## V1 V2
## 1 1 WALKING
## 2 2 WALKING_UPSTAIRS
## 3 3 WALKING_DOWNSTAIRS
## 4 4 SITTING
## 5 5 STANDING
## 6 6 LAYING
activity[, 2] <- tolower(gsub("_", "", activity[, 2]))
substr(activity[2, 2], 8, 8) <- toupper(substr(activity[2, 2], 8, 8))
substr(activity[3, 2], 8, 8) <- toupper(substr(activity[3, 2], 8, 8))
activityLabel <- activity[joinLabel[, 1], 2]
joinLabel[, 1] <- activityLabel
names(joinLabel) <- "activity"
head(joinLabel)
## activity
## 1 standing
## 2 standing
## 3 standing
## 4 standing
## 5 standing
## 6 standing
Finally the full data will be binded with the subject and activity labels. The activity labels will become the dependent variable for the statistical modelling task in the next section of the project.
names(joinSubject) <- "subject"
cleanData <- cbind(joinSubject, joinLabel, joinData)
remove(joinData); remove(activity)
remove(joinLabel); remove(joinSubject)
remove(activityLabel); remove(meanSD)
head(cleanData,5)[,c('subject', 'activity', 'tBodyAccMeanX', 'tBodyAccMeanY',
'tBodyAccMeanZ')]
## subject activity tBodyAccMeanX tBodyAccMeanY tBodyAccMeanZ
## 1 1 standing 0.2885845 -0.02029417 -0.1329051
## 2 1 standing 0.2784188 -0.01641057 -0.1235202
## 3 1 standing 0.2796531 -0.01946716 -0.1134617
## 4 1 standing 0.2791739 -0.02620065 -0.1232826
## 5 1 standing 0.2766288 -0.01656965 -0.1153619
First the clean data will be re-split into train and test sets using a 70/30 split. The data will be split on subject in order to get an even distribution of activities in each set.
set.seed(123)
# Convert to factors BEFORE splitting (preserves all levels)
cleanData$activity <- factor(cleanData$activity)
cleanData$subject <- factor(cleanData$subject)
# Get unique subjects
unique_subjects <- unique(cleanData$subject)
# Randomly select 70% of subjects for training
train_subjects <- sample(unique_subjects,
size = floor(0.7 * length(unique_subjects)))
# Split data
cleanData.train <- cleanData[cleanData$subject %in% train_subjects, ]
cleanData.test <- cleanData[!cleanData$subject %in% train_subjects, ]
# Verify factor levels match
identical(levels(cleanData.train$activity), levels(cleanData.test$activity))
## [1] TRUE
# Should return TRUE
activity.rf <- randomForest(as.factor(cleanData.train$activity)~., data=cleanData.train)
training.cm <- confusionMatrix(cleanData.train$activity,
predict(activity.rf, cleanData.train, type="class"))
training.cm[3]
## $overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 1.0000000 1.0000000 0.9994805 1.0000000 0.1887590
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
par(mfrow=c(1,1))
varImpPlot(activity.rf, pch=1, main="Random Forest Model Variables Importance")
test.cm <- confusionMatrix(cleanData.test$activity,
predict(activity.rf, cleanData.test,type="class"))
test.cm[3]
## $overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.9118750 0.8939579 0.9015165 0.9214732 0.1928125
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
The accuracy of the random forests model on the test set is 91.15%. This indicates that it is a good model for predicting activities based on data collected from the smart phones.
This project aimed to demonstrate the importance of create a clean and structured data set in order to perform meaningful analysis. There are a number of other important cleaning techniques that were not required as part of this project but should be considered in any data cleaning task. This includes filling in missing values, correcting erroneous values and standardising.
The clean data set allowed statistic analysis to be performed in a few simple steps with meaningful results. Further steps in this analysis could be to assess the random forests model against other statistical methods.
bookings_df <- read_csv("C:/Users/shafi/OneDrive/Desktop/resume project/project 1/data project 1/hotel_bookings.csv")
## Rows: 119390 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): hotel, is_canceled, arrival_date_month, meal, country, market_segm...
## dbl (17): lead_time, arrival_date_year, arrival_date_week_number, arrival_da...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(bookings_df) # Using base R
## # A tibble: 6 × 32
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <chr> <dbl> <dbl> <chr>
## 1 Resort Hotel no 342 2015 July
## 2 Resort Hotel no 737 2015 July
## 3 Resort Hotel no 7 2015 July
## 4 Resort Hotel no 13 2015 July
## 5 Resort Hotel no 14 2015 July
## 6 Resort Hotel no 14 2015 July
## # ℹ 27 more variables: arrival_date_week_number <dbl>,
## # arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
## # stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
## # meal <chr>, country <chr>, market_segment <chr>,
## # distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, …
bookings_df$is_canceled <- ifelse(bookings_df$is_canceled == "yes", 1, 0)
bookings_df$is_canceled <- as.numeric(bookings_df$is_canceled)
head(bookings_df)
## # A tibble: 6 × 32
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## # ℹ 27 more variables: arrival_date_week_number <dbl>,
## # arrival_date_day_of_month <dbl>, stays_in_weekend_nights <dbl>,
## # stays_in_week_nights <dbl>, adults <dbl>, children <dbl>, babies <dbl>,
## # meal <chr>, country <chr>, market_segment <chr>,
## # distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, …
str(bookings_df)
## spc_tbl_ [119,390 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ hotel : chr [1:119390] "Resort Hotel" "Resort Hotel" "Resort Hotel" "Resort Hotel" ...
## $ is_canceled : num [1:119390] 0 0 0 0 0 0 0 0 1 1 ...
## $ lead_time : num [1:119390] 342 737 7 13 14 14 0 9 85 75 ...
## $ arrival_date_year : num [1:119390] 2015 2015 2015 2015 2015 ...
## $ arrival_date_month : chr [1:119390] "July" "July" "July" "July" ...
## $ arrival_date_week_number : num [1:119390] 27 27 27 27 27 27 27 27 27 27 ...
## $ arrival_date_day_of_month : num [1:119390] 1 1 1 1 1 1 1 1 1 1 ...
## $ stays_in_weekend_nights : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ stays_in_week_nights : num [1:119390] 0 0 1 1 2 2 2 2 3 3 ...
## $ adults : num [1:119390] 2 2 1 1 2 2 2 2 2 2 ...
## $ children : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ babies : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ meal : chr [1:119390] "BB" "BB" "BB" "BB" ...
## $ country : chr [1:119390] "PRT" "PRT" "GBR" "GBR" ...
## $ market_segment : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ distribution_channel : chr [1:119390] "Direct" "Direct" "Direct" "Corporate" ...
## $ is_repeated_guest : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_cancellations : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ previous_bookings_not_canceled: num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ reserved_room_type : chr [1:119390] "C" "C" "A" "A" ...
## $ assigned_room_type : chr [1:119390] "C" "C" "C" "A" ...
## $ booking_changes : num [1:119390] 3 4 0 0 0 0 0 0 0 0 ...
## $ deposit_type : chr [1:119390] "No Deposit" "No Deposit" "No Deposit" "No Deposit" ...
## $ agent : chr [1:119390] "NULL" "NULL" "NULL" "304" ...
## $ company : chr [1:119390] "NULL" "NULL" "NULL" "NULL" ...
## $ days_in_waiting_list : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ customer_type : chr [1:119390] "Transient" "Transient" "Transient" "Transient" ...
## $ adr : num [1:119390] 0 0 75 75 98 ...
## $ required_car_parking_spaces : num [1:119390] 0 0 0 0 0 0 0 0 0 0 ...
## $ total_of_special_requests : num [1:119390] 0 0 0 0 1 1 0 1 1 0 ...
## $ reservation_status : chr [1:119390] "Check-Out" "Check-Out" "Check-Out" "Check-Out" ...
## $ reservation_status_date : chr [1:119390] "7/1/2015" "7/1/2015" "7/2/2015" "7/2/2015" ...
## - attr(*, "spec")=
## .. cols(
## .. hotel = col_character(),
## .. is_canceled = col_character(),
## .. lead_time = col_double(),
## .. arrival_date_year = col_double(),
## .. arrival_date_month = col_character(),
## .. arrival_date_week_number = col_double(),
## .. arrival_date_day_of_month = col_double(),
## .. stays_in_weekend_nights = col_double(),
## .. stays_in_week_nights = col_double(),
## .. adults = col_double(),
## .. children = col_double(),
## .. babies = col_double(),
## .. meal = col_character(),
## .. country = col_character(),
## .. market_segment = col_character(),
## .. distribution_channel = col_character(),
## .. is_repeated_guest = col_double(),
## .. previous_cancellations = col_double(),
## .. previous_bookings_not_canceled = col_double(),
## .. reserved_room_type = col_character(),
## .. assigned_room_type = col_character(),
## .. booking_changes = col_double(),
## .. deposit_type = col_character(),
## .. agent = col_character(),
## .. company = col_character(),
## .. days_in_waiting_list = col_double(),
## .. customer_type = col_character(),
## .. adr = col_double(),
## .. required_car_parking_spaces = col_double(),
## .. total_of_special_requests = col_double(),
## .. reservation_status = col_character(),
## .. reservation_status_date = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# You can also use colnames() to check the names of the columns in your data set. Run the code chunk below to find out the column names in this data set:
colnames(bookings_df)
## [1] "hotel" "is_canceled"
## [3] "lead_time" "arrival_date_year"
## [5] "arrival_date_month" "arrival_date_week_number"
## [7] "arrival_date_day_of_month" "stays_in_weekend_nights"
## [9] "stays_in_week_nights" "adults"
## [11] "children" "babies"
## [13] "meal" "country"
## [15] "market_segment" "distribution_channel"
## [17] "is_repeated_guest" "previous_cancellations"
## [19] "previous_bookings_not_canceled" "reserved_room_type"
## [21] "assigned_room_type" "booking_changes"
## [23] "deposit_type" "agent"
## [25] "company" "days_in_waiting_list"
## [27] "customer_type" "adr"
## [29] "required_car_parking_spaces" "total_of_special_requests"
## [31] "reservation_status" "reservation_status_date"
skim_without_charts(bookings_df)
| Name | bookings_df |
| Number of rows | 119390 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 14 |
| numeric | 18 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| hotel | 0 | 1 | 10 | 12 | 0 | 2 | 0 |
| arrival_date_month | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
| meal | 0 | 1 | 2 | 9 | 0 | 5 | 0 |
| country | 0 | 1 | 2 | 4 | 0 | 178 | 0 |
| market_segment | 0 | 1 | 6 | 13 | 0 | 8 | 0 |
| distribution_channel | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
| reserved_room_type | 0 | 1 | 1 | 1 | 0 | 10 | 0 |
| assigned_room_type | 0 | 1 | 1 | 1 | 0 | 12 | 0 |
| deposit_type | 0 | 1 | 10 | 10 | 0 | 3 | 0 |
| agent | 0 | 1 | 1 | 4 | 0 | 334 | 0 |
| company | 0 | 1 | 1 | 4 | 0 | 353 | 0 |
| customer_type | 0 | 1 | 5 | 15 | 0 | 4 | 0 |
| reservation_status | 0 | 1 | 7 | 9 | 0 | 3 | 0 |
| reservation_status_date | 0 | 1 | 8 | 10 | 0 | 926 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| is_canceled | 0 | 1 | 0.37 | 0.48 | 0.00 | 0.00 | 0.00 | 1 | 1 |
| lead_time | 0 | 1 | 104.01 | 106.86 | 0.00 | 18.00 | 69.00 | 160 | 737 |
| arrival_date_year | 0 | 1 | 2016.16 | 0.71 | 2015.00 | 2016.00 | 2016.00 | 2017 | 2017 |
| arrival_date_week_number | 0 | 1 | 27.17 | 13.61 | 1.00 | 16.00 | 28.00 | 38 | 53 |
| arrival_date_day_of_month | 0 | 1 | 15.80 | 8.78 | 1.00 | 8.00 | 16.00 | 23 | 31 |
| stays_in_weekend_nights | 0 | 1 | 0.93 | 1.00 | 0.00 | 0.00 | 1.00 | 2 | 19 |
| stays_in_week_nights | 0 | 1 | 2.50 | 1.91 | 0.00 | 1.00 | 2.00 | 3 | 50 |
| adults | 0 | 1 | 1.86 | 0.58 | 0.00 | 2.00 | 2.00 | 2 | 55 |
| children | 4 | 1 | 0.10 | 0.40 | 0.00 | 0.00 | 0.00 | 0 | 10 |
| babies | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0 | 10 |
| is_repeated_guest | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0 | 1 |
| previous_cancellations | 0 | 1 | 0.09 | 0.84 | 0.00 | 0.00 | 0.00 | 0 | 26 |
| previous_bookings_not_canceled | 0 | 1 | 0.14 | 1.50 | 0.00 | 0.00 | 0.00 | 0 | 72 |
| booking_changes | 0 | 1 | 0.22 | 0.65 | 0.00 | 0.00 | 0.00 | 0 | 21 |
| days_in_waiting_list | 0 | 1 | 2.32 | 17.59 | 0.00 | 0.00 | 0.00 | 0 | 391 |
| adr | 0 | 1 | 101.83 | 50.54 | -6.38 | 69.29 | 94.58 | 126 | 5400 |
| required_car_parking_spaces | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 8 |
| total_of_special_requests | 0 | 1 | 0.57 | 0.79 | 0.00 | 0.00 | 0.00 | 1 | 5 |
# Check total missing values
sum(is.na(bookings_df))
## [1] 4
# Check missing per column
colSums(is.na(bookings_df))
## hotel is_canceled
## 0 0
## lead_time arrival_date_year
## 0 0
## arrival_date_month arrival_date_week_number
## 0 0
## arrival_date_day_of_month stays_in_weekend_nights
## 0 0
## stays_in_week_nights adults
## 0 0
## children babies
## 4 0
## meal country
## 0 0
## market_segment distribution_channel
## 0 0
## is_repeated_guest previous_cancellations
## 0 0
## previous_bookings_not_canceled reserved_room_type
## 0 0
## assigned_room_type booking_changes
## 0 0
## deposit_type agent
## 0 0
## company days_in_waiting_list
## 0 0
## customer_type adr
## 0 0
## required_car_parking_spaces total_of_special_requests
## 0 0
## reservation_status reservation_status_date
## 0 0
# Check missing percentage per column
missing_pct <- colSums(is.na(bookings_df)) / nrow(bookings_df) * 100
print(missing_pct)
## hotel is_canceled
## 0.000000000 0.000000000
## lead_time arrival_date_year
## 0.000000000 0.000000000
## arrival_date_month arrival_date_week_number
## 0.000000000 0.000000000
## arrival_date_day_of_month stays_in_weekend_nights
## 0.000000000 0.000000000
## stays_in_week_nights adults
## 0.000000000 0.000000000
## children babies
## 0.003350364 0.000000000
## meal country
## 0.000000000 0.000000000
## market_segment distribution_channel
## 0.000000000 0.000000000
## is_repeated_guest previous_cancellations
## 0.000000000 0.000000000
## previous_bookings_not_canceled reserved_room_type
## 0.000000000 0.000000000
## assigned_room_type booking_changes
## 0.000000000 0.000000000
## deposit_type agent
## 0.000000000 0.000000000
## company days_in_waiting_list
## 0.000000000 0.000000000
## customer_type adr
## 0.000000000 0.000000000
## required_car_parking_spaces total_of_special_requests
## 0.000000000 0.000000000
## reservation_status reservation_status_date
## 0.000000000 0.000000000
# Remove ALL rows with any NA
cleanData_complete <- na.omit(bookings_df)
colSums(is.na(cleanData_complete))
## hotel is_canceled
## 0 0
## lead_time arrival_date_year
## 0 0
## arrival_date_month arrival_date_week_number
## 0 0
## arrival_date_day_of_month stays_in_weekend_nights
## 0 0
## stays_in_week_nights adults
## 0 0
## children babies
## 0 0
## meal country
## 0 0
## market_segment distribution_channel
## 0 0
## is_repeated_guest previous_cancellations
## 0 0
## previous_bookings_not_canceled reserved_room_type
## 0 0
## assigned_room_type booking_changes
## 0 0
## deposit_type agent
## 0 0
## company days_in_waiting_list
## 0 0
## customer_type adr
## 0 0
## required_car_parking_spaces total_of_special_requests
## 0 0
## reservation_status reservation_status_date
## 0 0
trimmed_df <- cleanData_complete %>%
select('hotel','is_canceled','lead_time')
trimmed_df %>%
select(hotel, is_canceled, lead_time) %>%
rename('hotel_type' = hotel)
## # A tibble: 119,386 × 3
## hotel_type is_canceled lead_time
## <chr> <dbl> <dbl>
## 1 Resort Hotel 0 342
## 2 Resort Hotel 0 737
## 3 Resort Hotel 0 7
## 4 Resort Hotel 0 13
## 5 Resort Hotel 0 14
## 6 Resort Hotel 0 14
## 7 Resort Hotel 0 0
## 8 Resort Hotel 0 9
## 9 Resort Hotel 1 85
## 10 Resort Hotel 1 75
## # ℹ 119,376 more rows
example_df <- cleanData_complete %>%
select(arrival_date_year, arrival_date_month) %>%
unite(arrival_month_year, c("arrival_date_month", "arrival_date_year"), sep = " ")