This R Markdown Notebook is my report for the Data Wrangling with R class assignment for Week 3.
The following report summarizes good practices when beginning work with a new data set.
For this report we look at the daily average temperatures for Cincinnati, Ohio, taken from the University of Dayton archive.
The archive contains data from 1995-present and is updated on a regular basis.
library('gdata')
library('ggplot2')
library('lubridate')
library('scales')
A brief summary of the data:
# Download the Cincinnati daily-temperature file and read it into a data frame.
# Column names are supplied explicitly; read.table()'s default header handling
# is left unchanged.
url <- "http://academic.udayton.edu/kissock/http/Weather/gsod95-current/OHCINCIN.txt"
colNames <- c("Month", "Date", "Year", "Temperature")
# Fields equal to "-99" are read as NA. The argument is spelled out in full
# (`na.strings`) rather than relying on partial matching of `na`.
cincy_weather <- read.table(url, col.names = colNames, na.strings = "-99")
# First look at the freshly loaded data: dimensions, column names, structure,
# a few sample rows, summary statistics and the number of missing values.
# Lines starting with `##` are the output recorded when the notebook was run.

# number of rows
nrow(cincy_weather)
## [1] 7963
# number of columns
ncol(cincy_weather)
## [1] 4
# column names
names(cincy_weather)
## [1] "Month" "Date" "Year" "Temperature"
# structure of data (attributes suppressed; FALSE spelled out because `F` can
# be reassigned)
str(cincy_weather, give.attr = FALSE)
## 'data.frame': 7963 obs. of 4 variables:
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 1995 1995 1995 1995 1995 1995 1995 1995 1995 1995 ...
## $ Temperature: num 41.1 22.2 22.8 14.9 9.5 23.8 31.1 26.9 31.3 31.5 ...
# sample rows
head(cincy_weather)
# summary stats
summary(cincy_weather)
## Month Date Year Temperature
## Min. : 1.000 Min. : 1.00 Min. :1995 Min. :-2.20
## 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2000 1st Qu.:40.20
## Median : 6.000 Median :16.00 Median :2005 Median :57.10
## Mean : 6.479 Mean :15.72 Mean :2005 Mean :54.73
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:2011 3rd Qu.:70.70
## Max. :12.000 Max. :31.00 Max. :2016 Max. :89.20
## NA's :14
# missing values (total count over every cell of the data frame)
sum(is.na(cincy_weather))
## [1] 14
The summary shows that all of the missing values are in the Temperature column. Let us impute each missing temperature with the average temperature of the surrounding week, i.e. the three days before and the three days after it.
# Impute every missing temperature with the mean of the readings in a window
# of up to seven consecutive rows centred on it (three rows on either side).
# Other missing values that fall inside the window are ignored.
missing_temps <- which(is.na(cincy_weather$Temperature))
n_days <- nrow(cincy_weather)
week_average <- vapply(missing_temps, function(x) {
  # Clamp the window to valid row indices. The lower bound is 1, not 0:
  # R silently drops a 0 index, so 0 only worked by accident.
  week_start <- max(1, x - 3)
  week_end <- min(x + 3, n_days)
  # NaN is produced if every reading in the window is missing.
  mean(cincy_weather$Temperature[week_start:week_end], na.rm = TRUE)
}, numeric(1))
# vapply() always yields a numeric vector (numeric(0) when nothing is missing),
# so this assignment is well-typed for any number of gaps.
cincy_weather$Temperature[missing_temps] <- week_average
# Re-run the summary after imputation; the recorded output below no longer
# lists any NA's for Temperature.
summary(cincy_weather)
## Month Date Year Temperature
## Min. : 1.000 Min. : 1.00 Min. :1995 Min. :-2.20
## 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2000 1st Qu.:40.20
## Median : 6.000 Median :16.00 Median :2005 Median :57.10
## Mean : 6.479 Mean :15.72 Mean :2005 Mean :54.73
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:2011 3rd Qu.:70.75
## Max. :12.000 Max. :31.00 Max. :2016 Max. :89.20
# Keep only the years before 2016.
# NOTE(review): presumably 2016 was only partially recorded when the data were
# downloaded -- confirm.
complete_cincy_weather <- cincy_weather[cincy_weather$Year < 2016, ]
# Overall mean daily temperature, rounded to two decimals; used as the
# reference line and its label.
mean_temp <- round(mean(complete_cincy_weather$Temperature), 2)

# Line chart of the mean temperature per year with the overall mean overlaid.
ggplot(data = complete_cincy_weather) +
  # Bare column names inside aes(): `df$col` bypasses ggplot2's data masking.
  # `fun` replaces the deprecated `fun.y`.
  geom_line(
    mapping = aes(x = Year, y = Temperature),
    color = "#428bca",
    stat = "summary", fun = mean
  ) +
  # A constant intercept draws one line. Mapping it through aes() inherited
  # the plot data and drew an identical line for every row.
  geom_hline(yintercept = mean_temp, color = "#5cb85c") +
  # annotate() places a single label; geom_text() with the inherited data
  # overplotted the same label once per row.
  annotate(
    "text",
    x = 1995, y = mean_temp, label = mean_temp,
    vjust = -1, size = 3
  ) +
  labs(x = "Year", y = "Temperature") +
  ggtitle("Average Yearly Temperature")
The above plot shows how the yearly average temperature changed from 1995 through 2015.

***
# Label every observation with the decade its year falls in (1990, 2000,
# 2010, ...) and store it as a factor for use as a fill colour.
# Integer division replaces the hard-coded nested ifelse(); for 1995-2015 it
# yields the same three levels.
complete_cincy_weather$Decade <- as.factor(
  (complete_cincy_weather$Year %/% 10) * 10
)
# One box per year showing the spread of daily temperatures, filled by decade.
ggplot(data = complete_cincy_weather) +
  # Bare column names inside aes(): `df$col` bypasses ggplot2's data masking.
  geom_boxplot(
    mapping = aes(x = as.factor(Year), y = Temperature, fill = Decade),
    outlier.shape = 1, outlier.size = 1
  ) +
  scale_fill_manual(values = c("#9068be", "#6ed3cf", "#e62739")) +
  # Only label every fifth year to keep the axis readable.
  scale_x_discrete(breaks = seq(1995, 2015, 5)) +
  # The x axis shows individual years (decade is the fill colour), so it is
  # labelled "Year"; the previous "Decade" label was wrong.
  labs(x = "Year", y = "Temperature") +
  ggtitle("Temperature Variation Across Decades") +
  guides(fill = guide_legend(title = "Decade", title.position = "top"))
The above plot shows the temperature variation across years and is colored by decade.
# Horizontal bar chart of the mean temperature per calendar month, with the
# overall mean (`mean_temp`, computed earlier) overlaid as a reference line.
ggplot(data = complete_cincy_weather) +
  # `fun` replaces the deprecated `fun.y`.
  geom_bar(
    mapping = aes(x = as.factor(Month), y = Temperature),
    fill = "#6ed3cf",
    stat = "summary",
    fun = mean
  ) +
  # Show month abbreviations instead of the numbers 1-12.
  scale_x_discrete(breaks = 1:12, labels = month.abb) +
  # A constant intercept draws one line. Mapping it through aes() inherited
  # the plot data and drew an identical line for every row.
  geom_hline(yintercept = mean_temp, color = "#5cb85c") +
  # annotate() places a single label; geom_text() with the inherited data
  # overplotted the same label once per row.
  annotate(
    "text",
    x = 1, y = mean_temp + 2, label = mean_temp,
    vjust = -1, size = 2
  ) +
  labs(x = "Month", y = "Temperature") +
  ggtitle("Temperature Variation Across Months") +
  coord_flip()
The above plot shows how the average temperature varies between months over the last 20 years.