Data Exploration

Topic: US travel behavior

Dataset background: Federal Highway Administration’s National Household Travel Survey

The dataset was selected from the 2024.02.07 edition of the “Data Is Plural” newsletter: https://www.data-is-plural.com/archive/2024-02-07-edition/

This is a real-world dataset collected by the Federal Highway Administration. US travel behavior has been monitored by the Federal Highway Administration’s National Household Travel Survey since 1968, conducted every five to eight years. This survey is considered the definitive source for understanding the travel habits of the American public. Respondents are asked to log all their household trips within a 24-hour period. The latest survey, conducted in 2022, collected data on approximately 31,000 trips made by around 17,000 individuals in about 8,000 households. It provides detailed information on each trip’s duration, vehicle specifics, purpose, parking expenses, traveler demographics, and other relevant factors.

# Open dataset 

# Read main dataset - 

# Folder that holds the NHTS CSV files read below. Set the NHTS_DATA_DIR
# environment variable to use a different location; when it is unset or empty
# the original author's path is used, so existing runs behave as before.
data_dir <- Sys.getenv("NHTS_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "F:\\a_Harrisburg_University_Academics\\ANLY 512-51- A-2024Summer - Data Visualization\\Assignment 6 - Storytelling with Data Data Exploration\\Travel sv"
}
# Fail with a message that names the folder instead of setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop("NHTS data directory not found: ", data_dir, call. = FALSE)
}
# The read_csv() calls that follow use bare file names, so the working
# directory still has to point at the data folder.
setwd(data_dir)

library(readr)

## Warning: package 'readr' was built under R version 4.2.3

# Load tripv2pub.csv from the working directory into `trip`.
trip <- readr::read_csv(file = "tripv2pub.csv")

## Rows: 31074 Columns: 85
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (61): PERSONID, TRIPID, SEQ_TRIPID, FRSTHM, PARK, HHMEMDRV, TDWKND, TRAV...
## dbl (24): HOUSEID, VEHCASEID, DWELTIME, TRVLCMIN, NUMONTRP, ONTD_P10, NONHHC...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load vehv2pub.csv from the working directory into `vehicle`.
vehicle <- readr::read_csv(file = "vehv2pub.csv")

## Rows: 14684 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (47): HOUSEID, VEHID, VEHYEAR, MAKE, HHVEHCNT, VEHTYPE, VEHFUEL, VEHCOMM...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load ldtv2pub.csv from the working directory into `idt`.
# NOTE(review): the file prefix is "ldt" but the variable is spelled `idt`;
# presumably "ldt" was meant. The name is kept because later code reads `idt`.
idt <- readr::read_csv(file = "ldtv2pub.csv")

## Rows: 16997 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (42): PERSONID, MAINMODE, INT_FLAG, ONTP_P1, ONTP_P2, ONTP_P3, ONTP_P4, ...
## dbl (24): HOUSEID, LONGDIST, LD_NUMONTRP, ONTP_P9, ONTP_P10, LD_AMT, LD_ICB,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load perv2pub.csv from the working directory into `person`.
person <- readr::read_csv(file = "perv2pub.csv")

## Rows: 16997 Columns: 129
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (95): PERSONID, R_SEX, R_RELAT, WORKER, DRIVER, R_RACE, OUTOFTWN, USEPUB...
## dbl (34): HOUSEID, WTPERFIN, WTPERFIN5D, WTPERFIN2D, R_AGE, GCDWORK, TAXISER...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load hhv2pub.csv from the working directory into `household`.
household <- readr::read_csv(file = "hhv2pub.csv")

## Rows: 7893 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): HOMEOWN, HOMETYPE, RAIL, CENSUS_D, CENSUS_R, HH_HISP, FLAG100, HHF...
## dbl (16): HOUSEID, WTHHFIN, WTHHFIN5D, WTHHFIN2D, NUMADLT, DRVRCNT, CNTTDHH,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Build an all-numeric working copy of each table; the objects returned by
# read_csv() are left untouched. Every column goes through
# as.numeric(as.character(.)), so a value that is not a number becomes NA.
# Assigning into `df[]` replaces the columns while keeping the table's class
# and column names.

# Household table
hh_df <- household
hh_df[] <- lapply(hh_df, function(column) as.numeric(as.character(column)))

# Trip table
trip_df <- trip
trip_df[] <- lapply(trip_df, function(column) as.numeric(as.character(column)))

# Table read from ldtv2pub.csv
idt_df <- idt
idt_df[] <- lapply(idt_df, function(column) as.numeric(as.character(column)))

# Person table
per_df <- person
per_df[] <- lapply(per_df, function(column) as.numeric(as.character(column)))

# Vehicle table
veh_df <- vehicle
veh_df[] <- lapply(veh_df, function(column) as.numeric(as.character(column)))





# Print the modified data frame
# print(hh_df)

# Check for missing values (NAs) in the data



# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
library(VIM, quietly = TRUE)

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Missing-value summary plot (VIM::aggr) for each numeric working table.
# `numbers = TRUE` is spelled out because `T` is a reassignable variable,
# not a reserved word.
aggr(hh_df, numbers = TRUE)

aggr(trip_df, numbers = TRUE)

aggr(idt_df, numbers = TRUE)

aggr(per_df, numbers = TRUE)

aggr(veh_df, numbers = TRUE)

library(corrplot)

## corrplot 0.92 loaded

# Correlation plot for each numeric working table; `tl.cex` shrinks the
# variable labels so the wider tables stay legible.
#
# `use = "complete.obs"` is passed so these plots are computed the same way as
# the ranked correlation tables later in this file. With cor()'s default
# (use = "everything"), one NA in a column turns every coefficient involving
# that column into NA.
corrplot(cor(hh_df, use = "complete.obs"), tl.cex = 0.5)

corrplot(cor(trip_df, use = "complete.obs"), tl.cex = 0.3)

corrplot(cor(idt_df, use = "complete.obs"), tl.cex = 0.4)

corrplot(cor(per_df, use = "complete.obs"), tl.cex = 0.2)

corrplot(cor(veh_df, use = "complete.obs"), tl.cex = 0.4)

Correlation matrix top correlation coefficients:

# Pairwise correlations between the columns of per_df, computed on the rows
# that have no missing value in any column.
cor_matrix <- cor(per_df, use = "complete.obs")

# Blank the lower triangle so every pair of variables is listed only once.
upper_tri <- replace(cor_matrix, lower.tri(cor_matrix), NA)

# Reshape to long form (Var1, Var2, Freq) and drop the blanked cells.
cor_df <- na.omit(as.data.frame(as.table(upper_tri)))

# Drop the diagonal (a variable paired with itself), then rank the pairs from
# the strongest to the weakest absolute correlation.
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]

# Keep the ranked table under a table-specific name.
cor_df_per_df <- cor_df

# Show the 30 strongest correlations.
head(cor_df_per_df, n = 30)

##            Var1       Var2      Freq
## 1817     R_RACE R_RACE_IMP 0.9999759
## 14687  CENSUS_D   CDIVMSAR 0.9993838
## 16497  CDIVMSAR  STRATUMID 0.9965903
## 5070   USAGE2_2   USAGE2_3 0.9964627
## 16493  CENSUS_D  STRATUMID 0.9960621
## 11179   CONDPUB  CONDSHARE 0.9957642
## 11050   CONDPUB   CONDSPEC 0.9954601
## 11180  CONDSPEC  CONDSHARE 0.9953442
## 10139  W_VISIMP    W_CHAIR 0.9953354
## 10010  W_VISIMP     W_SCCH 0.9951219
## 5329   USAGE2_3   USAGE2_5 0.9948240
## 11437   CONDPUB     CONDRF 0.9946129
## 11439 CONDSHARE     CONDRF 0.9944988
## 5200   USAGE2_3   USAGE2_4 0.9943278
## 11438  CONDSPEC     CONDRF 0.9941954
## 5328   USAGE2_2   USAGE2_5 0.9936316
## 5330   USAGE2_4   USAGE2_5 0.9934460
## 5199   USAGE2_2   USAGE2_4 0.9933281
## 6330     WORKER    PAYPROF 0.9928630
## 10140    W_SCCH    W_CHAIR 0.9920287
## 9880     W_WKCR   W_VISIMP 0.9903802
## 5458   USAGE2_3   USAGE2_6 0.9894974
## 10138    W_WKCR    W_CHAIR 0.9892183
## 5460   USAGE2_5   USAGE2_6 0.9887864
## 10009    W_WKCR     W_SCCH 0.9887686
## 5459   USAGE2_4   USAGE2_6 0.9885675
## 11440  CONDNONE     CONDRF 0.9883024
## 5457   USAGE2_2   USAGE2_6 0.9883020
## 11049  CONDRIVE   CONDSPEC 0.9882522
## 11308   CONDPUB   CONDNONE 0.9879304

# Pairwise correlations between the columns of hh_df, computed on the rows
# that have no missing value in any column.
cor_matrix <- cor(hh_df, use = "complete.obs")

# Blank the lower triangle so every pair of variables is listed only once.
upper_tri <- replace(cor_matrix, lower.tri(cor_matrix), NA)

# Reshape to long form (Var1, Var2, Freq) and drop the blanked cells.
cor_df <- na.omit(as.data.frame(as.table(upper_tri)))

# Drop the diagonal (a variable paired with itself), then rank the pairs from
# the strongest to the weakest absolute correlation.
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]

# Keep the ranked table under a table-specific name.
cor_df_hh_df <- cor_df

# Show the 30 strongest correlations.
head(cor_df_hh_df, n = 30)

##           Var1         Var2       Freq
## 464   CENSUS_D     CDIVMSAR  0.9993899
## 1204  CDIVMSAR    STRATUMID  0.9967489
## 1199  CENSUS_D    STRATUMID  0.9962130
## 971      URBAN       URBRUR  0.9727367
## 465   CENSUS_R     CDIVMSAR  0.9568406
## 324   CENSUS_D     CENSUS_R  0.9567655
## 1200  CENSUS_R    STRATUMID  0.9535959
## 1069    HHSIZE     RESP_CNT  0.9488492
## 576   HHFAMINC HHFAMINC_IMP  0.8608143
## 1113    URBRUR  URBRUR_2010  0.8299659
## 390    NUMADLT      DRVRCNT  0.8184127
## 1111     URBAN  URBRUR_2010  0.8182858
## 778       RAIL       MSACAT  0.7694530
## 1079    PPT517     RESP_CNT  0.7553198
## 1055   NUMADLT     RESP_CNT  0.7452636
## 999     HHSIZE       PPT517  0.7317740
## 635    NUMADLT       HHSIZE  0.7059614
## 72     WTHHFIN    WTHHFIN5D  0.6967124
## 1062   DRVRCNT     RESP_CNT  0.6801249
## 705    NUMADLT     HHRELATD -0.6705962
## 642    DRVRCNT       HHSIZE  0.6447699
## 936      URBAN    URBANSIZE  0.6334098
## 1071  HHRELATD     RESP_CNT -0.6284769
## 677    DRVRCNT     HHVEHCNT  0.6277426
## 712    DRVRCNT     HHRELATD -0.6236388
## 719     HHSIZE     HHRELATD -0.6184572
## 972  URBANSIZE       URBRUR  0.5500704
## 828     MSACAT      MSASIZE -0.5296170
## 1121   HOUSEID     TDAYDATE  0.5077024
## 216    HOMEOWN     HOMETYPE  0.4954214

# Pairwise correlations between the columns of veh_df, computed on the rows
# that have no missing value in any column.
cor_matrix <- cor(veh_df, use = "complete.obs")

# Blank the lower triangle so every pair of variables is listed only once.
upper_tri <- replace(cor_matrix, lower.tri(cor_matrix), NA)

# Reshape to long form (Var1, Var2, Freq) and drop the blanked cells.
cor_df <- na.omit(as.data.frame(as.table(upper_tri)))

# Drop the diagonal (a variable paired with itself), then rank the pairs from
# the strongest to the weakest absolute correlation.
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]

# Keep the ranked table under a table-specific name.
cor_df_veh_df <- cor_df

# Show the 30 strongest correlations.
head(cor_df_veh_df, n = 30)

##                Var1             Var2       Freq
## 800         HOUSEID        VEHCASEID  1.0000000
## 943         VEHYEAR           VEHAGE -0.9996603
## 1389       CENSUS_D         CDIVMSAR  0.9993866
## 2004       CDIVMSAR        STRATUMID  0.9963053
## 2000       CENSUS_D        STRATUMID  0.9956682
## 1871          URBAN           URBRUR  0.9760055
## 1248       CENSUS_D         CENSUS_R  0.9571119
## 1390       CENSUS_R         CDIVMSAR  0.9569795
## 2001       CENSUS_R        STRATUMID  0.9531951
## 432       VEHCOM_RS       VEHCOM_DEL  0.9190736
## 1339        NUMADLT          DRVRCNT  0.8761052
## 2193       HHFAMINC     HHFAMINC_IMP  0.8307284
## 1623           RAIL           MSACAT  0.7535177
## 1527        NUMADLT           HHSIZE  0.6952301
## 1533        DRVRCNT           HHSIZE  0.6906989
## 2112        WTHHFIN        WTHHFIN5D  0.6778495
## 190           VEHID         HHVEHCNT  0.6726851
## 1824          URBAN        URBANSIZE  0.6613027
## 670  COMMERCIALFREQ HHVEHUSETIME_OTH  0.6378659
## 479       VEHCOM_RS       VEHCOM_OTH  0.6152079
## 480      VEHCOM_DEL       VEHCOM_OTH  0.6081051
## 1872      URBANSIZE           URBRUR  0.5831634
## 1003       VEHOWNED         VEHOWNMO  0.5724584
## 1956        DRVRCNT         WRKCOUNT  0.5293307
## 1898      VEHCASEID         TDAYDATE  0.5150411
## 1881        HOUSEID         TDAYDATE  0.5150411
## 1950        NUMADLT         WRKCOUNT  0.4836780
## 1680         MSACAT          MSASIZE -0.4829830
## 1960         HHSIZE         WRKCOUNT  0.4788773
## 1774         MSACAT            URBAN  0.4357032

# Pairwise correlations between the columns of trip_df, computed on the rows
# that have no missing value in any column.
cor_matrix <- cor(trip_df, use = "complete.obs")

# Blank the lower triangle so every pair of variables is listed only once.
upper_tri <- replace(cor_matrix, lower.tri(cor_matrix), NA)

# Reshape to long form (Var1, Var2, Freq) and drop the blanked cells.
cor_df <- na.omit(as.data.frame(as.table(upper_tri)))

# Drop the diagonal (a variable paired with itself), then rank the pairs from
# the strongest to the weakest absolute correlation.
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]

# Keep the ranked table under a table-specific name.
cor_df_trip_df <- cor_df

# Show the 30 strongest correlations.
head(cor_df_trip_df, n = 30)

##           Var1         Var2       Freq
## 4081   HOUSEID     TDCASEID  1.0000000
## 5071  CENSUS_D     CDIVMSAR  0.9994084
## 258     TRIPID   SEQ_TRIPID  0.9987767
## 6180  CDIVMSAR    STRATUMID  0.9968642
## 6176  CENSUS_D    STRATUMID  0.9964242
## 5933     URBAN       URBRUR  0.9712287
## 2828  NUMONTRP     NONHHCNT  0.9666089
## 6722      PARK        PROXY -0.9661013
## 1548  STRTTIME      ENDTIME  0.9640211
## 5072  CENSUS_R     CDIVMSAR  0.9571278
## 4816  CENSUS_D     CENSUS_R  0.9569459
## 6177  CENSUS_R    STRATUMID  0.9541042
## 3266  WHODROVE WHODROVE_IMP  0.9105386
## 7201  HHFAMINC HHFAMINC_IMP  0.8531200
## 3167  TRPTRANS     PSGR_FLG -0.8383240
## 5297   ONTD_P4       HHSIZE  0.8329343
## 6688   HH_HISP       R_HISP  0.8204921
## 3150 VEHCASEID     PSGR_FLG  0.8146073
## 4983   NUMADLT      DRVRCNT  0.8120556
## 1705 VEHCASEID        VEHID  0.8077582
## 3082  TRPTRANS     DRVR_FLG -0.8034127
## 5296   ONTD_P3       HHSIZE  0.8007549
## 3956  WTTRDFIN   WTTRDFIN5D  0.7855270
## 6607   HH_RACE       R_RACE  0.7807626
## 1790 VEHCASEID     TRPTRANS -0.7804329
## 5580      RAIL       MSACAT  0.7679511
## 1793  HHMEMDRV     TRPTRANS -0.7616162
## 7060 VEHCASEID      VEHTYPE  0.7400979
## 3153  HHMEMDRV     PSGR_FLG  0.7378071
## 5298   ONTD_P5       HHSIZE  0.7329522

# Pairwise correlations between the columns of idt_df, computed on the rows
# that have no missing value in any column.
cor_matrix <- cor(idt_df, use = "complete.obs")

# Blank the lower triangle so every pair of variables is listed only once.
upper_tri <- replace(cor_matrix, lower.tri(cor_matrix), NA)

# Reshape to long form (Var1, Var2, Freq) and drop the blanked cells.
cor_df <- na.omit(as.data.frame(as.table(upper_tri)))

# Drop the diagonal (a variable paired with itself), then rank the pairs from
# the strongest to the weakest absolute correlation.
cor_df <- cor_df[cor_df$Var1 != cor_df$Var2, ]
cor_df <- cor_df[order(-abs(cor_df$Freq)), ]

# Keep the ranked table under a table-specific name.
cor_df_idt_df <- cor_df

# Show the 30 strongest correlations.
head(cor_df_idt_df, n = 30)

##          Var1         Var2      Freq
## 2543 CENSUS_D     CDIVMSAR 0.9993838
## 3537 CDIVMSAR    STRATUMID 0.9965903
## 3533 CENSUS_D    STRATUMID 0.9960621
## 1005  ONTP_P9     ONTP_P10 0.9891604
## 1606  ENDTRIP     MRT_DATE 0.9853262
## 1605  BEGTRIP     MRT_DATE 0.9849668
## 938   ONTP_P8      ONTP_P9 0.9796733
## 3349    URBAN       URBRUR 0.9728155
## 1407  BEGTRIP      ENDTRIP 0.9699927
## 1004  ONTP_P8     ONTP_P10 0.9693147
## 871   ONTP_P7      ONTP_P8 0.9604550
## 2544 CENSUS_R     CDIVMSAR 0.9564646
## 2345 CENSUS_D     CENSUS_R 0.9563994
## 1742  FARCDIV      FARCREG 0.9561699
## 3534 CENSUS_R    STRATUMID 0.9531397
## 937   ONTP_P7      ONTP_P9 0.9444249
## 1003  ONTP_P7     ONTP_P10 0.9344946
## 1541  NTSAWAY      WEEKEND 0.9026099
## 804   ONTP_P6      ONTP_P7 0.8975582
## 2007  FARCREG     GCD_FLAG 0.8828185
## 870   ONTP_P6      ONTP_P8 0.8657692
## 2680 HHFAMINC HHFAMINC_IMP 0.8532737
## 936   ONTP_P6      ONTP_P9 0.8483150
## 1002  ONTP_P6     ONTP_P10 0.8389992
## 1539  BEGTRIP      WEEKEND 0.8215350
## 1540  ENDTRIP      WEEKEND 0.8210699
## 4068  HH_RACE       R_RACE 0.8126424
## 4129  HH_HISP       R_HISP 0.8088318
## 1608  WEEKEND     MRT_DATE 0.8023114
## 737   ONTP_P5      ONTP_P6 0.7951024

# Load necessary libraries
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.3

library(patternplot)

## Warning: package 'patternplot' was built under R version 4.2.3

## 
## Attaching package: 'patternplot'

## The following object is masked from 'package:grid':
## 
##     pattern

# library(ggpattern)

# Boxplot: vehicles per household (HHVEHCNT) for each URBAN category

dataset <- veh_df

# Turn the URBAN codes 1-4 into a labelled factor; any other value becomes NA.
# The "\n" inside two of the labels wraps them onto two lines on the axis.
dataset$URBAN <- factor(
  dataset$URBAN,
  levels = c(1, 2, 3, 4),
  labels = c(
    "Urban area",
    "Urban cluster",
    "Area surrounded \n by urban areas",
    "Not in \n urban area"
  )
)

# One box per URBAN category, each with its own fill colour.
# NOTE(review): the y axis is limited to 0-8 but its tick marks start at 2,
# so 0 and 1 carry no label; confirm that is intended.
ggplot(dataset, aes(URBAN, HHVEHCNT, fill = URBAN)) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_manual(
    values = c(
      "Urban area" = "#1f77b4",
      "Urban cluster" = "#ff7f0e",
      "Area surrounded \n by urban areas" = "#2ca02c",
      "Not in \n urban area" = "#d62728"
    )
  ) +
  scale_y_continuous(
    breaks = seq(2, max(dataset$HHVEHCNT, na.rm = TRUE), by = 1),
    limits = c(0, 8)
  ) +
  labs(
    x = "Urban Area",
    y = "Number of Vehicles in Household",
    fill = "Area Type",
    title = "Boxplot of Number of Vehicles in Household by Urban Area"
  ) +
  theme_minimal() +
  # Centre the plot title and both axis titles.
  theme(
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )

## Warning: Removed 73 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Boxplot: vehicle model year (VEHYEAR) for each URBAN category.
# Reuses `dataset` (veh_df with URBAN already converted to a labelled factor).
# The y axis is limited to 1980-2025, with a tick every 4 years counted from
# the smallest VEHYEAR in the data.
ggplot(dataset, aes(URBAN, VEHYEAR, fill = URBAN)) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_manual(
    values = c(
      "Urban area" = "#1f77b4",
      "Urban cluster" = "#ff7f0e",
      "Area surrounded \n by urban areas" = "#2ca02c",
      "Not in \n urban area" = "#d62728"
    )
  ) +
  scale_y_continuous(
    breaks = seq(
      min(dataset$VEHYEAR, na.rm = TRUE),
      max(dataset$VEHYEAR, na.rm = TRUE),
      by = 4
    ),
    limits = c(1980, 2025)
  ) +
  labs(
    x = "Urban Area",
    y = "Vehicle Year",
    fill = "Area Type",
    title = "Boxplot of Vehicle Year by Urban Area"
  ) +
  theme_minimal() +
  # Centre the plot title and both axis titles.
  theme(
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )

# Boxplot: trip duration (TRVLCMIN) by vehicle type (TRPHHVEH)

# Start again from the full trip table. (The former `dataset = 0` reset was a
# dead store that was overwritten on the next line.)
dataset <- trip_df

# Drop rows where TRPHHVEH is coded -1; rows where it is NA are dropped too,
# because subset() discards rows whose condition evaluates to NA.
dataset <- subset(dataset, TRPHHVEH != -1)

# Label codes 1 and 2; any other remaining value becomes NA.
dataset$TRPHHVEH <- factor(
  dataset$TRPHHVEH,
  levels = c(1, 2),
  labels = c("Household Vehicle", "Other Vehicle")
)

# One box per vehicle type.
# NOTE(review): `limits = c(0, 100)` removes rows outside 0-100 before the box
# statistics are computed (ggplot2 warns about the removed rows). If the aim is
# only to zoom, coord_cartesian(ylim = c(0, 100)) keeps all rows; confirm
# which is intended.
ggplot(dataset, aes(x = TRPHHVEH, y = TRVLCMIN, fill = TRPHHVEH)) +
  geom_boxplot(alpha = 0.5) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Household Vehicle" = "#ff7f0e",
      "Other Vehicle" = "#2ca02c"
    )
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)
  ) +
  labs(
    title = "Boxplot of Vehicles by Trip duration",
    fill = "Type",
    x = "Vehicles Type",
    y = "Trip duration (mins)"
  ) +
  # Centre the plot title and both axis titles.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )

## Warning: Removed 594 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Scatter plot: trip duration (TRVLCMIN) against trip distance (VMT_MILE),
# coloured and shaped by vehicle type (TRPHHVEH)

# Start again from the full trip table. (The former `dataset = 0` reset was a
# dead store that was overwritten on the next line.)
dataset <- trip_df

# Drop rows where VMT_MILE is coded -1 or -9; rows where it is NA are dropped
# too, because subset() discards rows whose condition evaluates to NA.
dataset <- subset(dataset, VMT_MILE != -1 & VMT_MILE != -9)

# Label TRPHHVEH codes 1 and 2. Every other value becomes NA and therefore
# appears in neither of the two point layers below.
dataset$TRPHHVEH <- factor(
  dataset$TRPHHVEH,
  levels = c(1, 2),
  labels = c("Household Vehicle", "Other Vehicle")
)

# Each group is drawn as its own layer so the two groups can use different
# alpha values. Both axes are limited to 0-200; points outside that range are
# removed with a warning.
ggplot(dataset, aes(x = VMT_MILE, y = TRVLCMIN, color = TRPHHVEH, shape = TRPHHVEH)) +
  geom_point(
    data = subset(dataset, TRPHHVEH == "Household Vehicle"),
    alpha = 0.6, size = 2, fill = NA
  ) +
  geom_point(
    data = subset(dataset, TRPHHVEH == "Other Vehicle"),
    alpha = 0.8, size = 2
  ) +
  theme_minimal() +
  scale_color_manual(
    values = c("Household Vehicle" = "#ff7f0e", "Other Vehicle" = "#2ca02c")
  ) +
  # Shape 21 is a hollow circle, shape 4 is a cross.
  scale_shape_manual(
    values = c("Household Vehicle" = 21, "Other Vehicle" = 4)
  ) +
  scale_x_continuous(
    limits = c(0, 200),
    breaks = seq(0, max(dataset$VMT_MILE, na.rm = TRUE), by = 10)
  ) +
  scale_y_continuous(
    limits = c(0, 200),
    breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)
  ) +
  labs(
    title = "Scatter Plot of Vehicles by Trip Duration",
    color = "Vehicle Type",
    shape = "Vehicle Type",
    x = "Trip Miles (miles)",
    y = "Trip Duration (mins)"
  ) +
  # Centre the plot title and both axis titles.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )

## Warning: Removed 135 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Scatter plot: trip duration (TRVLCMIN) against trip distance (VMT_MILE),
# coloured and shaped by employment status (WORKER)

# Start again from the full trip table. (The former `dataset = 0` reset was a
# dead store that was overwritten on the next line.)
dataset <- trip_df

# Drop rows where VMT_MILE is coded -1 or -9, then rows where WORKER is coded
# -1. Rows where the tested column is NA are dropped too, because subset()
# discards rows whose condition evaluates to NA.
dataset <- subset(dataset, VMT_MILE != -1 & VMT_MILE != -9)
dataset <- subset(dataset, WORKER != -1)

# Label WORKER codes 1 and 2; any other remaining value becomes NA and
# therefore appears in neither of the two point layers below.
dataset$WORKER <- factor(
  dataset$WORKER,
  levels = c(1, 2),
  labels = c("Worker", "Not Worker")
)

# Each group is drawn as its own layer. Both axes are limited to 0-200;
# points outside that range are removed with a warning.
ggplot(dataset, aes(x = VMT_MILE, y = TRVLCMIN, color = WORKER, shape = WORKER)) +
  geom_point(
    data = subset(dataset, WORKER == "Worker"),
    alpha = 0.6, size = 2, fill = NA
  ) +
  geom_point(
    data = subset(dataset, WORKER == "Not Worker"),
    alpha = 0.6, size = 2
  ) +
  theme_minimal() +
  scale_color_manual(values = c("Worker" = "#ff7f0e", "Not Worker" = "#2ca02c")) +
  # Shape 21 is a hollow circle, shape 4 is a cross.
  scale_shape_manual(values = c("Worker" = 21, "Not Worker" = 4)) +
  scale_x_continuous(
    limits = c(0, 200),
    breaks = seq(0, max(dataset$VMT_MILE, na.rm = TRUE), by = 10)
  ) +
  scale_y_continuous(
    limits = c(0, 200),
    breaks = seq(0, max(dataset$TRVLCMIN, na.rm = TRUE), by = 10)
  ) +
  # "Employment" was misspelled "Employement" in the title and legend.
  labs(
    title = "Scatter Plot of Employment Status by Trip Duration",
    color = "Employment Status",
    shape = "Employment Status",
    x = "Trip Miles (miles)",
    y = "Trip Duration (mins)"
  ) +
  # Centre the plot title and both axis titles.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )

## Warning: Removed 94 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 54 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Boxplot: trip distance (VMT_MILE) by employment status (WORKER)

# Start again from the full trip table. (The former `dataset = 0` reset was a
# dead store that was overwritten on the next line.)
dataset <- trip_df

# Drop rows where VMT_MILE is coded -1 or -9, then rows where WORKER is coded
# -1. Rows where the tested column is NA are dropped too, because subset()
# discards rows whose condition evaluates to NA.
dataset <- subset(dataset, VMT_MILE != -1 & VMT_MILE != -9)
dataset <- subset(dataset, WORKER != -1)

# Label WORKER codes 1 and 2; any other remaining value becomes NA.
dataset$WORKER <- factor(
  dataset$WORKER,
  levels = c(1, 2),
  labels = c("Worker", "Not Worker")
)

# Outlier removal is switched off for this plot, so every remaining row is
# passed to ggplot. The disabled call is kept for reference:
# dataset_filtered <- remove_outliers(dataset, "VMT_MILE")
dataset_filtered <- dataset

# One box per employment status.
# NOTE(review): `limits = c(0, 40)` removes rows above 40 miles before the box
# statistics are computed (ggplot2 warns about the removed rows). If the aim is
# only to zoom, coord_cartesian(ylim = c(0, 40)) keeps all rows; confirm
# which is intended.
ggplot(dataset_filtered, aes(x = WORKER, y = VMT_MILE, fill = WORKER)) +
  geom_boxplot(alpha = 0.5) +
  theme_minimal() +
  scale_fill_manual(
    values = c(
      "Worker" = "#00FFFF",
      "Not Worker" = "#FF7F00"
    )
  ) +
  scale_y_continuous(
    limits = c(0, 40),
    breaks = seq(0, max(dataset_filtered$VMT_MILE, na.rm = TRUE), by = 5)
  ) +
  # "Employment" was misspelled "Employement" in the title, legend and axis.
  labs(
    title = "Employment Status by Trip Distance",
    fill = "Employment Status",
    x = "Employment Status",
    y = "Trip distance (miles)"
  ) +
  # Centre the plot title and both axis titles.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5)
  )

## Warning: Removed 826 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Data preparation for the boxplot of trip distance (VMT_MILE) by day of
# travel (TRAVDAY)

# Start again from the full trip table. (The former `dataset = 0` reset was a
# dead store that was overwritten on the next line.)
dataset <- trip_df

# Drop rows where VMT_MILE is coded -1 or -9, as the other VMT_MILE plots in
# this file do. Left in, those codes are treated as real distances by the
# quartile-based outlier filter applied to this data, which shifts its
# thresholds. Rows where VMT_MILE is NA are dropped as well.
dataset <- subset(dataset, VMT_MILE != -1 & VMT_MILE != -9)

# Label TRAVDAY codes 1-7 as Sunday-Saturday; any other value becomes NA.
dataset$TRAVDAY <- factor(
  dataset$TRAVDAY,
  levels = c(1, 2, 3, 4, 5, 6, 7),
  labels = c(
    "Sunday",
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday"
  )
)

#' Drop rows whose value in one column lies outside the IQR fences
#'
#' Keeps the rows where `data[[column]]` falls within
#' `[Q1 - k * IQR, Q3 + k * IQR]`, with the quartiles computed after ignoring
#' missing values. Rows whose value is `NA` are dropped.
#'
#' @param data A data frame (or tibble).
#' @param column Name (or position) of the column to screen.
#' @param k Fence multiplier; the default 1.5 reproduces the previous
#'   hard-coded behaviour.
#' @return `data` restricted to the retained rows. The result is always a data
#'   frame, including when `data` has a single column.
remove_outliers <- function(data, column, k = 1.5) {
  values <- data[[column]]
  # `[[` returns NULL for an unknown column name; without this check the
  # function silently returned zero rows.
  if (is.null(values)) {
    stop("Column not found in `data`: ", column, call. = FALSE)
  }

  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named `spread` rather than `IQR` so the stats::IQR function is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  lower_fence <- quartiles[1] - k * spread
  upper_fence <- quartiles[2] + k * spread

  # which() turns NA comparisons into "not selected" instead of NA rows;
  # drop = FALSE stops a one-column data.frame collapsing to a bare vector.
  data[which(values >= lower_fence & values <= upper_fence), , drop = FALSE]
}

# Keep only the rows whose VMT_MILE passes the remove_outliers() filter.
dataset_filtered <- remove_outliers(dataset, "VMT_MILE")

# Boxplot: trip distance (VMT_MILE) for each day of travel (TRAVDAY), one fill
# colour per day. The y axis is limited to 0-10 with a tick every 2 miles.
ggplot(dataset_filtered, aes(TRAVDAY, VMT_MILE, fill = TRAVDAY)) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_manual(
    values = c(
      "Sunday" = "#0000FF",
      "Monday" = "#007FFF",
      "Tuesday" = "#00FFFF",
      "Wednesday" = "#7FFF7F",
      "Thursday" = "#FFFF00",
      "Friday" = "#FF7F00",
      "Saturday" = "#FF0000"
    )
  ) +
  scale_y_continuous(
    breaks = seq(0, max(dataset_filtered$VMT_MILE, na.rm = TRUE), by = 2),
    limits = c(0, 10)
  ) +
  labs(
    x = "Day",
    y = "Trip distance (miles)",
    fill = "Type",
    title = "Boxplot of Days by Trip Distance"
  ) +
  theme_minimal() +
  # Centre the plot title and both axis titles.
  theme(
    axis.title.x = element_text(hjust = 0.5),
    axis.title.y = element_text(hjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )

## Warning: Removed 13772 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Data Exploration

Saurabh Shirish Prabhu

2024-07-21