##1 Describe your data set This dataset contains hourly data on the traffic volume for westbound I-94, a major interstate highway in the US that connects Minneapolis and St Paul, Minnesota. The data was collected by the Minnesota Department of Transportation (MnDOT) from 2012 to 2018 at a station roughly midway between the two cities.
holiday: a categorical variable that indicates whether the date is a US national holiday or a regional holiday (such as the Minnesota State Fair).
temp: a numeric variable that shows the average temperature in kelvin.
rain_1h: a numeric variable that shows the amount of rain in mm that occurred in the hour.
snow_1h: a numeric variable that shows the amount of snow in mm that occurred in the hour.
clouds_all: a numeric variable that shows the percentage of cloud cover.
weather_main: a categorical variable that gives a short textual description of the current weather (such as Clear, Clouds, Rain, etc.).
weather_description: a categorical variable that gives a longer textual description of the current weather (such as light rain, overcast clouds, etc.).
date_time: a datetime variable that shows the hour of the data collected in local CST time.
traffic_volume: a numeric variable that shows the hourly I-94 reported westbound traffic volume.
#2
library(readr)
library(knitr)
# Read the traffic CSV from the working directory into a tibble.
# date_time comes in as a character column here and is parsed further down.
traffic <- read_csv("Metro_Interstate_Traffic_Volume.csv")
## Rows: 48204 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): holiday, weather_main, weather_description, date_time
## dbl (5): traffic_volume, temp, rain_1h, snow_1h, clouds_all
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count the missing cells in each column, then add the per-column counts up
total_nas <- sum(
  vapply(traffic, function(column) sum(is.na(column)), integer(1))
)
print(total_nas)
## [1] 0
This dataset has no NA values
#3 I am going to answer two questions about traffic with this dataset ##1 What hour of the day has the highest traffic volume on westbound I-94?
##2 What weather causes the lowest traffic volume?
#4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 1. Formulate your question
#1 What hour of the day has the highest traffic volume?
# 2. Read in your data
#Already read in
# 3. Check the dimensions
# Number of columns (variables)
dim(traffic)[2L]
## [1] 9
# Number of rows (hourly records)
dim(traffic)[1L]
## [1] 48204
# 4. Look at the head and tail
# First 50 rows (the tibble printout still shows only the first 10 of them)
head(traffic, n = 50L)
## # A tibble: 50 × 9
## traffic_volume holiday temp rain_1h snow_1h clouds_all weather_main
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5545 None 288. 0 0 40 Clouds
## 2 4516 None 289. 0 0 75 Clouds
## 3 4767 None 290. 0 0 90 Clouds
## 4 5026 None 290. 0 0 90 Clouds
## 5 4918 None 291. 0 0 75 Clouds
## 6 5181 None 292. 0 0 1 Clear
## 7 5584 None 293. 0 0 1 Clear
## 8 6015 None 294. 0 0 1 Clear
## 9 5791 None 294. 0 0 20 Clouds
## 10 4770 None 293. 0 0 20 Clouds
## # ℹ 40 more rows
## # ℹ 2 more variables: weather_description <chr>, date_time <chr>
# Last 50 rows
tail(traffic, n = 50L)
## # A tibble: 50 × 9
## traffic_volume holiday temp rain_1h snow_1h clouds_all weather_main
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 867 None 275. 0 0 1 Clear
## 2 524 None 275. 0 0 1 Clear
## 3 359 None 275. 0 0 1 Clear
## 4 425 None 274. 0 0 1 Clear
## 5 743 None 275. 0 0 1 Clear
## 6 1359 None 275. 0 0 1 Clear
## 7 2036 None 275. 0 0 1 Clear
## 8 3073 None 275. 0 0 75 Clouds
## 9 3725 None 276. 0 0 90 Clouds
## 10 4059 None 277. 0 0 1 Clear
## # ℹ 40 more rows
## # ℹ 2 more variables: weather_description <chr>, date_time <chr>
# 5. Check the data types
# Compact listing of each column's type and its first few values
str(object = traffic)
## spc_tbl_ [48,204 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ traffic_volume : num [1:48204] 5545 4516 4767 5026 4918 ...
## $ holiday : chr [1:48204] "None" "None" "None" "None" ...
## $ temp : num [1:48204] 288 289 290 290 291 ...
## $ rain_1h : num [1:48204] 0 0 0 0 0 0 0 0 0 0 ...
## $ snow_1h : num [1:48204] 0 0 0 0 0 0 0 0 0 0 ...
## $ clouds_all : num [1:48204] 40 75 90 90 75 1 1 1 20 20 ...
## $ weather_main : chr [1:48204] "Clouds" "Clouds" "Clouds" "Clouds" ...
## $ weather_description: chr [1:48204] "scattered clouds" "broken clouds" "overcast clouds" "overcast clouds" ...
## $ date_time : chr [1:48204] "02-10-2012 09:00" "02-10-2012 10:00" "02-10-2012 11:00" "02-10-2012 12:00" ...
## - attr(*, "spec")=
## .. cols(
## .. traffic_volume = col_double(),
## .. holiday = col_character(),
## .. temp = col_double(),
## .. rain_1h = col_double(),
## .. snow_1h = col_double(),
## .. clouds_all = col_double(),
## .. weather_main = col_character(),
## .. weather_description = col_character(),
## .. date_time = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Pull the hour of day out of date_time to find the hour with the most traffic.
# The raw strings are day-month-year ("02-10-2012 09:00" is 2 October 2012).
# Parsing them as month-day-year left 29274 of the 48204 rows NA (every row
# whose day is greater than 12) and swapped day and month in the rest.
# tz = "UTC" keeps the recorded wall-clock hour unchanged, independent of the
# machine's time zone and of daylight-saving gaps in the local zone.
traffic$date_time <- as.POSIXct(traffic$date_time,
                                format = "%d-%m-%Y %H:%M", tz = "UTC")
# Surface any parsing failure instead of letting NAs slip into the analysis
if (anyNA(traffic$date_time)) {
  warning(sum(is.na(traffic$date_time)),
          " date_time values could not be parsed", call. = FALSE)
}
# Extract the hour ("00" to "23") and store it as a categorical variable
traffic$hour <- as.factor(format(traffic$date_time, "%H"))
# List the distinct holiday labels that occur in this dataset
print(unique(traffic[["holiday"]]))
## [1] "None" "Columbus Day"
## [3] "Veterans Day" "Thanksgiving Day"
## [5] "Christmas Day" "New Years Day"
## [7] "Washingtons Birthday" "Memorial Day"
## [9] "Independence Day" "State Fair"
## [11] "Labor Day" "Martin Luther King Jr Day"
# 6. Summary statistics
# Per-column summary of the whole table
print(summary(object = traffic))
## traffic_volume holiday temp rain_1h
## Min. : 0 Length:48204 Min. : 0.0 Min. : 0.000
## 1st Qu.:1193 Class :character 1st Qu.:272.2 1st Qu.: 0.000
## Median :3380 Mode :character Median :282.4 Median : 0.000
## Mean :3260 Mean :281.2 Mean : 0.334
## 3rd Qu.:4933 3rd Qu.:291.8 3rd Qu.: 0.000
## Max. :7280 Max. :310.1 Max. :9831.300
##
## snow_1h clouds_all weather_main weather_description
## Min. :0.0000000 Min. : 0.00 Length:48204 Length:48204
## 1st Qu.:0.0000000 1st Qu.: 1.00 Class :character Class :character
## Median :0.0000000 Median : 64.00 Mode :character Mode :character
## Mean :0.0002224 Mean : 49.36
## 3rd Qu.:0.0000000 3rd Qu.: 90.00
## Max. :0.5100000 Max. :100.00
##
## date_time hour
## Min. :2012-01-11 00:00:00.00 04 : 824
## 1st Qu.:2014-02-07 03:00:00.00 07 : 812
## Median :2016-05-06 21:30:00.00 01 : 810
## Mean :2015-12-15 07:55:34.32 23 : 810
## 3rd Qu.:2017-07-12 12:45:00.00 08 : 809
## Max. :2018-12-09 23:00:00.00 (Other):14865
## NA's :29274 NA's :29274
# 7. Visualizations
# Traffic volume by hour of day. hour is a factor, so plot() draws one box per
# hour (a box plot) rather than a scatter plot; the title and x label say so.
# date_time is already a POSIXct column at this point, so it is not parsed again.
plot(traffic$hour, traffic$traffic_volume,
     xlab = "Hour of day", ylab = "Traffic Volume",
     main = "Box Plot of Traffic Volume by Hour", col = "yellow")
# Mean traffic volume for each weather category
traffic_by_weather <- aggregate(traffic$traffic_volume,
                                by = list(weather_main = traffic$weather_main),
                                FUN = mean)
# Mean traffic volume for each hour of day. The mean is used instead of the
# sum because the hours do not all have the same number of rows, so a sum
# would favour hours that simply have more records.
hourly_traffic <- aggregate(traffic$traffic_volume,
                            by = list(hour = traffic$hour),
                            FUN = mean)
# Find the hour with the maximum traffic volume
max_hour <- hourly_traffic$hour[which.max(hourly_traffic$x)]
print(max_hour)
## [1] 16
## 24 Levels: 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 ... 23
# Weather categories that will appear as bars in the traffic-by-weather plot
unique(traffic[["weather_main"]])
## [1] "Clouds" "Clear" "Rain" "Drizzle" "Mist"
## [6] "Haze" "Fog" "Thunderstorm" "Snow" "Squall"
## [11] "Smoke"
# Bar chart of mean traffic volume, one bar per weather category
barplot(
  height = traffic_by_weather[["x"]],
  names.arg = traffic_by_weather[["weather_main"]],
  xlab = "Weather Main",
  ylab = "Mean Traffic Volume",
  main = "Traffic Volume by Weather Main",
  col = rainbow(nrow(traffic_by_weather)),
  las = 2  # turn the axis labels so every weather type is shown
)
# Row of the weather category with the smallest mean traffic volume
min_index <- which.min(traffic_by_weather[["x"]])
# Look up that category's name and show it
weather_with_min_volume <- traffic_by_weather[["weather_main"]][min_index]
print(weather_with_min_volume)
## [1] "Squall"
#The bar plot and the minimum-mean lookup both show that squalls are associated with the lowest mean traffic volume
# 8. Deal with missing values
#I dont have missing values
# 9. Consider transformations
#I created a hour categorical variable by pulling the hour from the date time variable in the data frame
# 10. Think about your results
# Reflect on your initial findings and decide if you need to refine your analysis further.
#5 Observations from Data:
#6 Is there a statistically significant difference in the mean traffic volume when there is precipitation or no precipitation on westbound I-94?
#Null Hypothesis (H0): There is no difference in the mean traffic volume between hours with precipitation and hours without precipitation.
#Alternative Hypothesis (Ha): There is a difference in the mean traffic volume between hours with precipitation and hours without precipitation.
# Flag each hourly record "yes" when any rain or snow was recorded, else "no"
traffic$precipitation <- with(
  traffic,
  ifelse(rain_1h > 0 | snow_1h > 0, "yes", "no")
)
# Split the records into the two precipitation groups
yes_precip <- traffic[which(traffic$precipitation == "yes"), ]
no_precip <- traffic[which(traffic$precipitation == "no"), ]
# Two-sample t-test on traffic volume (t.test() runs the Welch variant here)
t_test_result <- t.test(yes_precip$traffic_volume, no_precip$traffic_volume)
# Show the test result
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: yes_precip$traffic_volume and no_precip$traffic_volume
## t = 0.72544, df = 4088.2, p-value = 0.4682
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -43.33564 94.24224
## sample estimates:
## mean of x mean of y
## 3283.410 3257.957
#Based on the p-value of .4682, which is greater than .05, I fail to reject the null hypothesis
# Aggregate mean traffic volume by precipitation category ("no" / "yes")
mean_traffic_by_precipitation <- aggregate(
  traffic$traffic_volume,
  by = list(precipitation = traffic$precipitation),
  FUN = mean
)
# Bar plot of the two group means.
# legend.text is spelled out in full rather than relying on partial matching
# of `legend`. NOTE(review): the legend labels assume the rows come back in
# the order "no", "yes" - confirm if the categories ever change.
barplot(mean_traffic_by_precipitation$x,
        names.arg = mean_traffic_by_precipitation$precipitation,
        xlab = "Precipitation", ylab = "Mean Traffic Volume",
        main = "Mean Traffic Volume by Precipitation",
        col = c("#fed1ff", "#deda92"),  # colours given as hexadecimal codes
        legend.text = c("No", "Yes"))
#The plot shows the difference between the two means is minuscule
#Conclusion After analyzing the data and conducting a statistical hypothesis test, we find no statistically significant difference in mean traffic volume between hours with precipitation and hours without.
This finding has practical implications for traffic management and planning. While precipitation can often lead to concerns about reduced visibility and challenging road conditions, this analysis indicates that it might not be a significant driver of fluctuations in traffic volume. This knowledge can guide decision-makers in allocating resources and implementing strategies to manage traffic flow more effectively during rainy or snowy periods.
#7 Wanting to find if there is a correlation between traffic volume and cloud coverage
# Scatter plot of traffic volume against cloud coverage
plot(traffic$clouds_all, traffic$traffic_volume,
     xlab = "Cloud coverage", ylab = "Traffic Volume",
     main = "Traffic Volume by cloud coverage")
# Simple linear regression of traffic volume on cloud coverage.
# The formula uses bare column names so they are looked up in `data`; with
# traffic$... inside the formula the data argument is bypassed, the
# coefficient is labelled "traffic$clouds_all", and predict() cannot be
# given new data.
linear_model <- lm(traffic_volume ~ clouds_all, data = traffic)
summary(linear_model)
##
## Call:
## lm(formula = traffic$traffic_volume ~ traffic$clouds_all, data = traffic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3398.6 -2034.4 125.7 1669.8 4149.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3091.2619 14.5613 212.29 <2e-16 ***
## traffic$clouds_all 3.4147 0.2314 14.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1982 on 48202 degrees of freedom
## Multiple R-squared: 0.004496, Adjusted R-squared: 0.004476
## F-statistic: 217.7 on 1 and 48202 DF, p-value: < 2.2e-16
#The p-value for the overall test and for the cloud coverage coefficient is <2e-16, which means cloud coverage is a statistically significant predictor of traffic volume, but the p-value should be read together with the R-squared value to judge how strong the relationship is
#The adjusted R-squared value is .004476, which means cloud coverage explains less than half a percent of the variation in traffic volume, so the relationship is very weak
#8 There are a few ethical concerns with collecting traffic data
Privacy and Personal Data: Collecting and storing data related to traffic patterns and volume could inadvertently include information about individuals’ travel habits, locations, and potentially even identifiable information. There’s a risk of breaching privacy if this data is not properly anonymized and safeguarded. Ensuring that any personal or sensitive information is adequately protected is a crucial ethical consideration.
Public Safety and Security: If the dataset contains sensitive information about traffic patterns, there’s a potential risk that this data could be exploited by malicious actors for harmful purposes, such as planning criminal activities or targeting vulnerable areas.