Topic: US travel behavior
Dataset background: Federal Highway Administration’s National Household Travel Survey
The dataset was selected from the 2024.02.07 edition of the “data is plural” newsletter linked here: https://www.data-is-plural.com/archive/2024-02-07-edition/Links to an external site.
This is a real-world dataset collected by the Federal Highway Administration. US travel behavior has been monitored by the Federal Highway Administration’s National Household Travel Survey since 1968, conducted every five to eight years. This survey is considered the definitive source for understanding the travel habits of the American public. Respondents are asked to log all their household trips within a 24-hour period. The latest survey, conducted in 2022, collected data on approximately 31,000 trips made by around 17,000 individuals in about 8,000 households. It provides detailed information on each trip’s duration, vehicle specifics, purpose, parking expenses, traveler demographics, and other relevant factors.
# Open dataset
# Read main dataset -
setwd("F:\\a_Harrisburg_University_Academics\\ANLY 512-51- A-2024Summer - Data Visualization\\Assignment 6 - Storytelling with Data Data Exploration\\Travel sv")
library(readr)
## Warning: package 'readr' was built under R version 4.2.3
trip <- read_csv("tripv2pub.csv")
## Rows: 31074 Columns: 85
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (61): PERSONID, TRIPID, SEQ_TRIPID, FRSTHM, PARK, HHMEMDRV, TDWKND, TRAV...
## dbl (24): HOUSEID, VEHCASEID, DWELTIME, TRVLCMIN, NUMONTRP, ONTD_P10, NONHHC...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
vehicle <- read_csv("vehv2pub.csv")
## Rows: 14684 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (47): HOUSEID, VEHID, VEHYEAR, MAKE, HHVEHCNT, VEHTYPE, VEHFUEL, VEHCOMM...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
idt <- read_csv("ldtv2pub.csv")
## Rows: 16997 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (42): PERSONID, MAINMODE, INT_FLAG, ONTP_P1, ONTP_P2, ONTP_P3, ONTP_P4, ...
## dbl (24): HOUSEID, LONGDIST, LD_NUMONTRP, ONTP_P9, ONTP_P10, LD_AMT, LD_ICB,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
person <- read_csv("perv2pub.csv")
## Rows: 16997 Columns: 129
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (95): PERSONID, R_SEX, R_RELAT, WORKER, DRIVER, R_RACE, OUTOFTWN, USEPUB...
## dbl (34): HOUSEID, WTPERFIN, WTPERFIN5D, WTPERFIN2D, R_AGE, GCDWORK, TAXISER...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
household <- read_csv("hhv2pub.csv")
## Rows: 7893 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): HOMEOWN, HOMETYPE, RAIL, CENSUS_D, CENSUS_R, HH_HISP, FLAG100, HHF...
## dbl (16): HOUSEID, WTHHFIN, WTHHFIN5D, WTHHFIN2D, NUMADLT, DRVRCNT, CNTTDHH,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hh_df <- household
trip_df <- trip
idt_df <- idt
per_df <- person
veh_df <- vehicle
# Convert all features to numeric
hh_df[] <- lapply(hh_df, function(x) as.numeric(as.character(x)))
trip_df[] <- lapply(trip_df, function(x) as.numeric(as.character(x)))
idt_df[] <- lapply(idt_df, function(x) as.numeric(as.character(x)))
per_df[] <- lapply(per_df, function(x) as.numeric(as.character(x)))
veh_df[] <- lapply(veh_df, function(x) as.numeric(as.character(x)))
# Print the modified data frame
# print(hh_df)
# Checking missing values / NAs in adata
library(VIM,quietly =T)
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(hh_df,numbers =T)
aggr(trip_df,numbers =T)
aggr(idt_df,numbers =T)
aggr(per_df,numbers =T)
aggr(veh_df,numbers =T)
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(hh_df) , tl.cex = 0.5)
corrplot(cor(trip_df), tl.cex = 0.3)
corrplot(cor(idt_df) , tl.cex = 0.4)
corrplot(cor(per_df) , tl.cex = 0.2)
corrplot(cor(veh_df) , tl.cex = 0.4)
Correlation matrix top correlation coefficients:
# Compute the correlation matrix
cor_matrix <- cor(per_df, use = "complete.obs")
# Get the upper triangle of the correlation matrix
upper_tri <- cor_matrix
upper_tri[lower.tri(upper_tri)] <- NA
# Convert to a data frame
cor_df <- as.data.frame(as.table(upper_tri))
cor_df <- na.omit(cor_df)
# Remove self-correlations and sort by absolute correlation
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]
cor_df_per_df <- cor_df
# Display the top 10 correlations
head(cor_df_per_df, 30)
## Var1 Var2 Freq
## 1817 R_RACE R_RACE_IMP 0.9999759
## 14687 CENSUS_D CDIVMSAR 0.9993838
## 16497 CDIVMSAR STRATUMID 0.9965903
## 5070 USAGE2_2 USAGE2_3 0.9964627
## 16493 CENSUS_D STRATUMID 0.9960621
## 11179 CONDPUB CONDSHARE 0.9957642
## 11050 CONDPUB CONDSPEC 0.9954601
## 11180 CONDSPEC CONDSHARE 0.9953442
## 10139 W_VISIMP W_CHAIR 0.9953354
## 10010 W_VISIMP W_SCCH 0.9951219
## 5329 USAGE2_3 USAGE2_5 0.9948240
## 11437 CONDPUB CONDRF 0.9946129
## 11439 CONDSHARE CONDRF 0.9944988
## 5200 USAGE2_3 USAGE2_4 0.9943278
## 11438 CONDSPEC CONDRF 0.9941954
## 5328 USAGE2_2 USAGE2_5 0.9936316
## 5330 USAGE2_4 USAGE2_5 0.9934460
## 5199 USAGE2_2 USAGE2_4 0.9933281
## 6330 WORKER PAYPROF 0.9928630
## 10140 W_SCCH W_CHAIR 0.9920287
## 9880 W_WKCR W_VISIMP 0.9903802
## 5458 USAGE2_3 USAGE2_6 0.9894974
## 10138 W_WKCR W_CHAIR 0.9892183
## 5460 USAGE2_5 USAGE2_6 0.9887864
## 10009 W_WKCR W_SCCH 0.9887686
## 5459 USAGE2_4 USAGE2_6 0.9885675
## 11440 CONDNONE CONDRF 0.9883024
## 5457 USAGE2_2 USAGE2_6 0.9883020
## 11049 CONDRIVE CONDSPEC 0.9882522
## 11308 CONDPUB CONDNONE 0.9879304
# Compute the correlation matrix
cor_matrix <- cor(hh_df, use = "complete.obs")
# Get the upper triangle of the correlation matrix
upper_tri <- cor_matrix
upper_tri[lower.tri(upper_tri)] <- NA
# Convert to a data frame
cor_df <- as.data.frame(as.table(upper_tri))
cor_df <- na.omit(cor_df)
# Remove self-correlations and sort by absolute correlation
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]
cor_df_hh_df <- cor_df
# Display the top 10 correlations
head(cor_df_hh_df, 30)
## Var1 Var2 Freq
## 464 CENSUS_D CDIVMSAR 0.9993899
## 1204 CDIVMSAR STRATUMID 0.9967489
## 1199 CENSUS_D STRATUMID 0.9962130
## 971 URBAN URBRUR 0.9727367
## 465 CENSUS_R CDIVMSAR 0.9568406
## 324 CENSUS_D CENSUS_R 0.9567655
## 1200 CENSUS_R STRATUMID 0.9535959
## 1069 HHSIZE RESP_CNT 0.9488492
## 576 HHFAMINC HHFAMINC_IMP 0.8608143
## 1113 URBRUR URBRUR_2010 0.8299659
## 390 NUMADLT DRVRCNT 0.8184127
## 1111 URBAN URBRUR_2010 0.8182858
## 778 RAIL MSACAT 0.7694530
## 1079 PPT517 RESP_CNT 0.7553198
## 1055 NUMADLT RESP_CNT 0.7452636
## 999 HHSIZE PPT517 0.7317740
## 635 NUMADLT HHSIZE 0.7059614
## 72 WTHHFIN WTHHFIN5D 0.6967124
## 1062 DRVRCNT RESP_CNT 0.6801249
## 705 NUMADLT HHRELATD -0.6705962
## 642 DRVRCNT HHSIZE 0.6447699
## 936 URBAN URBANSIZE 0.6334098
## 1071 HHRELATD RESP_CNT -0.6284769
## 677 DRVRCNT HHVEHCNT 0.6277426
## 712 DRVRCNT HHRELATD -0.6236388
## 719 HHSIZE HHRELATD -0.6184572
## 972 URBANSIZE URBRUR 0.5500704
## 828 MSACAT MSASIZE -0.5296170
## 1121 HOUSEID TDAYDATE 0.5077024
## 216 HOMEOWN HOMETYPE 0.4954214
# Compute the correlation matrix
cor_matrix <- cor(veh_df, use = "complete.obs")
# Get the upper triangle of the correlation matrix
upper_tri <- cor_matrix
upper_tri[lower.tri(upper_tri)] <- NA
# Convert to a data frame
cor_df <- as.data.frame(as.table(upper_tri))
cor_df <- na.omit(cor_df)
# Remove self-correlations and sort by absolute correlation
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]
cor_df_veh_df <- cor_df
# Display the top 10 correlations
head(cor_df_veh_df, 30)
## Var1 Var2 Freq
## 800 HOUSEID VEHCASEID 1.0000000
## 943 VEHYEAR VEHAGE -0.9996603
## 1389 CENSUS_D CDIVMSAR 0.9993866
## 2004 CDIVMSAR STRATUMID 0.9963053
## 2000 CENSUS_D STRATUMID 0.9956682
## 1871 URBAN URBRUR 0.9760055
## 1248 CENSUS_D CENSUS_R 0.9571119
## 1390 CENSUS_R CDIVMSAR 0.9569795
## 2001 CENSUS_R STRATUMID 0.9531951
## 432 VEHCOM_RS VEHCOM_DEL 0.9190736
## 1339 NUMADLT DRVRCNT 0.8761052
## 2193 HHFAMINC HHFAMINC_IMP 0.8307284
## 1623 RAIL MSACAT 0.7535177
## 1527 NUMADLT HHSIZE 0.6952301
## 1533 DRVRCNT HHSIZE 0.6906989
## 2112 WTHHFIN WTHHFIN5D 0.6778495
## 190 VEHID HHVEHCNT 0.6726851
## 1824 URBAN URBANSIZE 0.6613027
## 670 COMMERCIALFREQ HHVEHUSETIME_OTH 0.6378659
## 479 VEHCOM_RS VEHCOM_OTH 0.6152079
## 480 VEHCOM_DEL VEHCOM_OTH 0.6081051
## 1872 URBANSIZE URBRUR 0.5831634
## 1003 VEHOWNED VEHOWNMO 0.5724584
## 1956 DRVRCNT WRKCOUNT 0.5293307
## 1898 VEHCASEID TDAYDATE 0.5150411
## 1881 HOUSEID TDAYDATE 0.5150411
## 1950 NUMADLT WRKCOUNT 0.4836780
## 1680 MSACAT MSASIZE -0.4829830
## 1960 HHSIZE WRKCOUNT 0.4788773
## 1774 MSACAT URBAN 0.4357032
# Compute the correlation matrix
cor_matrix <- cor(trip_df, use = "complete.obs")
# Get the upper triangle of the correlation matrix
upper_tri <- cor_matrix
upper_tri[lower.tri(upper_tri)] <- NA
# Convert to a data frame
cor_df <- as.data.frame(as.table(upper_tri))
cor_df <- na.omit(cor_df)
# Remove self-correlations and sort by absolute correlation
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]
cor_df_trip_df <- cor_df
# Display the top 10 correlations
head(cor_df_trip_df, 30)
## Var1 Var2 Freq
## 4081 HOUSEID TDCASEID 1.0000000
## 5071 CENSUS_D CDIVMSAR 0.9994084
## 258 TRIPID SEQ_TRIPID 0.9987767
## 6180 CDIVMSAR STRATUMID 0.9968642
## 6176 CENSUS_D STRATUMID 0.9964242
## 5933 URBAN URBRUR 0.9712287
## 2828 NUMONTRP NONHHCNT 0.9666089
## 6722 PARK PROXY -0.9661013
## 1548 STRTTIME ENDTIME 0.9640211
## 5072 CENSUS_R CDIVMSAR 0.9571278
## 4816 CENSUS_D CENSUS_R 0.9569459
## 6177 CENSUS_R STRATUMID 0.9541042
## 3266 WHODROVE WHODROVE_IMP 0.9105386
## 7201 HHFAMINC HHFAMINC_IMP 0.8531200
## 3167 TRPTRANS PSGR_FLG -0.8383240
## 5297 ONTD_P4 HHSIZE 0.8329343
## 6688 HH_HISP R_HISP 0.8204921
## 3150 VEHCASEID PSGR_FLG 0.8146073
## 4983 NUMADLT DRVRCNT 0.8120556
## 1705 VEHCASEID VEHID 0.8077582
## 3082 TRPTRANS DRVR_FLG -0.8034127
## 5296 ONTD_P3 HHSIZE 0.8007549
## 3956 WTTRDFIN WTTRDFIN5D 0.7855270
## 6607 HH_RACE R_RACE 0.7807626
## 1790 VEHCASEID TRPTRANS -0.7804329
## 5580 RAIL MSACAT 0.7679511
## 1793 HHMEMDRV TRPTRANS -0.7616162
## 7060 VEHCASEID VEHTYPE 0.7400979
## 3153 HHMEMDRV PSGR_FLG 0.7378071
## 5298 ONTD_P5 HHSIZE 0.7329522
# Compute the correlation matrix
cor_matrix <- cor(idt_df, use = "complete.obs")
# Get the upper triangle of the correlation matrix
upper_tri <- cor_matrix
upper_tri[lower.tri(upper_tri)] <- NA
# Convert to a data frame
cor_df <- as.data.frame(as.table(upper_tri))
cor_df <- na.omit(cor_df)
# Remove self-correlations and sort by absolute correlation
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]
cor_df_idt_df <- cor_df
# Display the top 10 correlations
head(cor_df_idt_df, 30)
## Var1 Var2 Freq
## 2543 CENSUS_D CDIVMSAR 0.9993838
## 3537 CDIVMSAR STRATUMID 0.9965903
## 3533 CENSUS_D STRATUMID 0.9960621
## 1005 ONTP_P9 ONTP_P10 0.9891604
## 1606 ENDTRIP MRT_DATE 0.9853262
## 1605 BEGTRIP MRT_DATE 0.9849668
## 938 ONTP_P8 ONTP_P9 0.9796733
## 3349 URBAN URBRUR 0.9728155
## 1407 BEGTRIP ENDTRIP 0.9699927
## 1004 ONTP_P8 ONTP_P10 0.9693147
## 871 ONTP_P7 ONTP_P8 0.9604550
## 2544 CENSUS_R CDIVMSAR 0.9564646
## 2345 CENSUS_D CENSUS_R 0.9563994
## 1742 FARCDIV FARCREG 0.9561699
## 3534 CENSUS_R STRATUMID 0.9531397
## 937 ONTP_P7 ONTP_P9 0.9444249
## 1003 ONTP_P7 ONTP_P10 0.9344946
## 1541 NTSAWAY WEEKEND 0.9026099
## 804 ONTP_P6 ONTP_P7 0.8975582
## 2007 FARCREG GCD_FLAG 0.8828185
## 870 ONTP_P6 ONTP_P8 0.8657692
## 2680 HHFAMINC HHFAMINC_IMP 0.8532737
## 936 ONTP_P6 ONTP_P9 0.8483150
## 1002 ONTP_P6 ONTP_P10 0.8389992
## 1539 BEGTRIP WEEKEND 0.8215350
## 1540 ENDTRIP WEEKEND 0.8210699
## 4068 HH_RACE R_RACE 0.8126424
## 4129 HH_HISP R_HISP 0.8088318
## 1608 WEEKEND MRT_DATE 0.8023114
## 737 ONTP_P5 ONTP_P6 0.7951024
# Load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(patternplot)
## Warning: package 'patternplot' was built under R version 4.2.3
##
## Attaching package: 'patternplot'
## The following object is masked from 'package:grid':
##
## pattern
# library(ggpattern)
# Plot 3: Boxplot of HHVEHCNT by URBAN
dataset <- veh_df
# Convert necessary columns to factors
dataset$URBAN <- factor(dataset$URBAN,
levels = c(1, 2, 3, 4),
labels = c("Urban area",
"Urban cluster",
"Area surrounded \n by urban areas",
"Not in \n urban area"))
# Create the boxplot of Urban area vs Vehicle count
ggplot(dataset, aes(x = URBAN, y = HHVEHCNT, fill = URBAN)) +
geom_boxplot(alpha = 0.5) +
theme_minimal() +
scale_fill_manual(values = c("Urban area" = "#1f77b4",
"Urban cluster" = "#ff7f0e",
"Area surrounded \n by urban areas" = "#2ca02c",
"Not in \n urban area" = "#d62728")) +
scale_y_continuous(limits= c(0,8), breaks = seq(2, max(dataset$HHVEHCNT, na.rm = TRUE), by = 1)) +
labs(
title = 'Boxplot of Number of Vehicles in Household by Urban Area',
fill = "Area Type",
x = 'Urban Area',
y = 'Number of Vehicles in Household'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 73 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Create the boxplot of Urban area vs Vehicle year
ggplot(dataset, aes(x = URBAN, y = VEHYEAR, fill = URBAN)) +
geom_boxplot(alpha = 0.5) +
theme_minimal() +
scale_fill_manual(values = c("Urban area" = "#1f77b4",
"Urban cluster" = "#ff7f0e",
"Area surrounded \n by urban areas" = "#2ca02c",
"Not in \n urban area" = "#d62728")) +
scale_y_continuous(limits= c(1980,2025), breaks = seq(min(dataset$VEHYEAR, na.rm = TRUE), max(dataset$VEHYEAR, na.rm = TRUE), by = 4)) +
labs(
title = 'Boxplot of Vehicle Year by Urban Area',
fill = "Area Type",
x = 'Urban Area',
y = 'Vehicle Year'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
# Plot 3: Boxplot of HHVEHCNT by URBAN
dataset = 0
dataset <- trip_df
# Convert necessary columns to factors
dataset <- subset(dataset, TRPHHVEH != -1) # remove level -1
dataset$TRPHHVEH <- factor(dataset$TRPHHVEH,
levels = c(1, 2),
labels = c(
"Household Vehicle",
"Other Vehicle"
))
# Create the boxplot of Urban area vs Vehicle count
ggplot(dataset, aes(x = TRPHHVEH, y = TRVLCMIN, fill = TRPHHVEH)) +
geom_boxplot(alpha = 0.5) +
theme_minimal() +
scale_fill_manual(values = c("Household Vehicle" = "#ff7f0e",
"Other Vehicle" = "#2ca02c")) +
scale_y_continuous(limits= c(0,100), breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)) +
labs(
title = 'Boxplot of Vehicles by Trip duration',
fill = "Type",
x = 'Vehicles Type',
y = 'Trip duration (mins)'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 594 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Create the boxplot of Urban area vs Vehicle year
# Convert necessary columns to factors
dataset = 0
dataset <- trip_df
dataset <- subset(dataset, VMT_MILE != -1) # remove level -1
dataset <- subset(dataset, VMT_MILE != -9) # remove level -1
dataset$TRPHHVEH <- factor(dataset$TRPHHVEH,
levels = c(1, 2),
labels = c(
"Household Vehicle",
"Other Vehicle"
))
ggplot(dataset, aes(x = VMT_MILE, y = TRVLCMIN, color = TRPHHVEH, shape = TRPHHVEH)) +
geom_point(data = subset(dataset, TRPHHVEH == "Household Vehicle"), alpha = 0.6, size = 2, fill = NA) +
geom_point(data = subset(dataset, TRPHHVEH == "Other Vehicle"), alpha = 0.8, size = 2) +
theme_minimal() +
scale_color_manual(values = c("Household Vehicle" = "#ff7f0e", "Other Vehicle" = "#2ca02c")) +
scale_shape_manual(values = c("Household Vehicle" = 21, "Other Vehicle" = 4)) + # 21 for hollow circle, 4 for cross
scale_x_continuous(limits= c(0, 200), breaks = seq(0, max(dataset$VMT_MILE, na.rm = TRUE), by = 10)) +
scale_y_continuous(limits= c(0, 200), breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)) +
labs(
title = 'Scatter Plot of Vehicles by Trip Duration',
color = "Vehicle Type",
shape = "Vehicle Type",
x = 'Trip Miles (miles)',
y = 'Trip Duration (mins)'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 135 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Create the boxplot
# Convert necessary columns to factors
dataset = 0
dataset <- trip_df
dataset <- subset(dataset, VMT_MILE != -1) # remove level -1
dataset <- subset(dataset, VMT_MILE != -9) # remove level -1
dataset <- subset(dataset, WORKER != -1) # remove level -1
dataset$WORKER <- factor(dataset$WORKER,
levels = c(1, 2),
labels = c(
"Worker",
"Not Worker"
))
ggplot(dataset, aes(x = VMT_MILE, y = TRVLCMIN, color = WORKER, shape = WORKER)) +
geom_point(data = subset(dataset, WORKER == "Worker"), alpha = 0.6, size = 2, fill = NA) +
geom_point(data = subset(dataset, WORKER == "Not Worker"), alpha = 0.6, size = 2) +
theme_minimal() +
scale_color_manual(values = c("Worker" = "#ff7f0e", "Not Worker" = "#2ca02c")) +
scale_shape_manual(values = c("Worker" = 21, "Not Worker" = 4)) + # 21 for hollow circle, 4 for cross
scale_x_continuous(limits= c(0, 200), breaks = seq(0, max(dataset$VMT_MILE, na.rm = TRUE), by = 10)) +
scale_y_continuous(limits= c(0, 200), breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)) +
labs(
title = 'Scatter Plot of Employement Status by Trip Duration',
color = "Employement Status",
shape = "Employement Status",
x = 'Trip Miles (miles)',
y = 'Trip Duration (mins)'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 94 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 54 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Create the boxplot of Urban area vs Travel Day
# Convert necessary columns to factors
dataset = 0
dataset <- trip_df
dataset <- subset(dataset, VMT_MILE != -1) # remove level -1
dataset <- subset(dataset, VMT_MILE != -9) # remove level -1
dataset <- subset(dataset, WORKER != -1) # remove level -1
dataset$WORKER <- factor(dataset$WORKER,
levels = c(1, 2),
labels = c(
"Worker",
"Not Worker"
))
# Function to remove outliers
# remove_outliers <- function(data, column) {
# Q1 <- quantile(data[[column]], 0.25, na.rm = TRUE)
# Q3 <- quantile(data[[column]], 0.75, na.rm = TRUE)
# IQR <- Q3 - Q1
# data[which(data[[column]] >= (Q1 - 1.5 * IQR) & data[[column]] <= (Q3 + 1.5 * IQR)), ]
# }
# Remove outliers from dataset
# dataset_filtered <- remove_outliers(dataset, "VMT_MILE")
dataset_filtered <- dataset
# Create the boxplot of Urban area vs Vehicle count
ggplot(dataset_filtered, aes(x = WORKER, y = VMT_MILE, fill = WORKER)) +
geom_boxplot(alpha = 0.5) +
theme_minimal() +
scale_fill_manual(values = c(
"Worker" = "#00FFFF",
"Not Worker"="#FF7F00"
)) +
scale_y_continuous(limits= c(0,40), breaks = seq(0, max(dataset_filtered$VMT_MILE, na.rm = TRUE), by = 5)) +
labs(
title = 'Employement Status by Trip Distance',
fill = "Employement Status",
x = 'Employement Status',
y = 'Trip distance (miles)'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 826 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Create the boxplot of Urban area vs Travel Day
# Convert necessary columns to factors
dataset = 0
dataset <- trip_df
dataset$TRAVDAY <- factor(dataset$TRAVDAY,
levels = c(1, 2, 3, 4, 5, 6, 7),
labels = c(
"Sunday",
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday"
))
# Function to remove outliers
remove_outliers <- function(data, column) {
Q1 <- quantile(data[[column]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[column]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
data[which(data[[column]] >= (Q1 - 1.5 * IQR) & data[[column]] <= (Q3 + 1.5 * IQR)), ]
}
# Remove outliers from dataset
dataset_filtered <- remove_outliers(dataset, "VMT_MILE")
# Create the boxplot of Urban area vs Vehicle count
ggplot(dataset_filtered, aes(x = TRAVDAY, y = VMT_MILE, fill = TRAVDAY)) +
geom_boxplot(alpha = 0.5) +
theme_minimal() +
scale_fill_manual(values = c( "Sunday"= "#0000FF",
"Monday"= "#007FFF",
"Tuesday" = "#00FFFF",
"Wednesday"= "#7FFF7F",
"Thursday"= "#FFFF00",
"Friday"="#FF7F00",
"Saturday"= "#FF0000"
)) +
scale_y_continuous(limits= c(0,10), breaks = seq(0, max(dataset_filtered$VMT_MILE, na.rm = TRUE), by = 2)) +
labs(
title = 'Boxplot of Days by Trip Distance',
fill = "Type",
x = 'Day',
y = 'Trip distance (miles)'
) +
theme(
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(hjust = 0.5),
axis.title.y = element_text(hjust = 0.5)
)
## Warning: Removed 13772 rows containing non-finite outside the scale range
## (`stat_boxplot()`).