Synopsis

In this Week 3 homework we download the Cincinnati weather data from “http://academic.udayton.edu/kissock/http/Weather/gsod95-current/OHCINCIN.txt”, explore the data, prepare it for analysis, and finally visualize it using the ggplot2 package to derive some insights.

Packages Required

The following packages are required:

library(tidyverse) # to perform visualizations using ggplot

Source code

There are four variables in the data: Month, Day, Year and Avg_Temp. The Month, Day and Year variables together give the date of each observation. The Avg_Temp variable is the daily average temperature (in Fahrenheit), computed from 24 hourly temperature readings per day from the Global Summary of the Day (GSOD) data. Missing data was coded as -99 in the source file, which I have changed to NA. Link to the data codebook: http://academic.udayton.edu/kissock/http/Weather/

Data Description

Let us first scrape the data and then look into it.

# Download the data. The four column names are supplied while reading
# instead of being assigned afterwards.

url <- "http://academic.udayton.edu/kissock/http/Weather/gsod95-current/OHCINCIN.txt"
cin_weather <- read.table(
  file = url,
  col.names = c("Month", "Day", "Year", "Avg_Temp"),
  stringsAsFactors = FALSE
)

# Let us look at the number of rows and columns in the data.

nrow(cin_weather)
## [1] 7963
ncol(cin_weather)
## [1] 4
# The two counts above can also be obtained together as the dimensions of the table.

dim(cin_weather)
## [1] 7963    4
# Let us look at the variables (column names) present.

names(cin_weather)
## [1] "Month"    "Day"      "Year"     "Avg_Temp"
# Look at the structure of the data: the type of each column and its first few values.

str(cin_weather)
## 'data.frame':    7963 obs. of  4 variables:
##  $ Month   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Day     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year    : int  1995 1995 1995 1995 1995 1995 1995 1995 1995 1995 ...
##  $ Avg_Temp: num  41.1 22.2 22.8 14.9 9.5 23.8 31.1 26.9 31.3 31.5 ...
# Preview the first six and the last six rows of the dataset.

head(cin_weather, n = 6L)
##   Month Day Year Avg_Temp
## 1     1   1 1995     41.1
## 2     1   2 1995     22.2
## 3     1   3 1995     22.8
## 4     1   4 1995     14.9
## 5     1   5 1995      9.5
## 6     1   6 1995     23.8
tail(cin_weather, n = 6L)
##      Month Day Year Avg_Temp
## 7958    10  14 2016     54.4
## 7959    10  15 2016     63.2
## 7960    10  16 2016     68.7
## 7961    10  17 2016     71.1
## 7962    10  18 2016     74.4
## 7963    10  19 2016     75.3
# Check and count the missing values. The source file codes a missing
# temperature as -99, so that sentinel is recoded to NA first. The recode is
# limited to Avg_Temp, the only column in which -99 stands for "missing",
# instead of being applied to every cell of the data frame.

cin_weather$Avg_Temp[cin_weather$Avg_Temp == -99] <- NA

# Total number of missing cells across the whole table.
sum(is.na(cin_weather))
## [1] 14
# My hunch is that all of these missing values sit in the recorded
# temperature variable, so we count the NAs in individual columns.

sum(is.na(cin_weather[["Month"]]))
## [1] 0
sum(is.na(cin_weather[["Day"]]))
## [1] 0
sum(is.na(cin_weather[["Avg_Temp"]]))
## [1] 14
# The missing values do belong to the temperature variable, so list the
# positions of the rows where Avg_Temp is NA.

which(is.na(cin_weather[["Avg_Temp"]]))
##  [1] 1454 1455 1460 1461 1471 2726 2727 2728 2729 2807 2982 4623 5016 5213
# Show the full rows that contain at least one missing value.

cin_weather[rowSums(is.na(cin_weather)) > 0, ]
##      Month Day Year Avg_Temp
## 1454    12  24 1998       NA
## 1455    12  25 1998       NA
## 1460    12  30 1998       NA
## 1461    12  31 1998       NA
## 1471     1  10 1999       NA
## 2726     6  18 2002       NA
## 2727     6  19 2002       NA
## 2728     6  20 2002       NA
## 2729     6  21 2002       NA
## 2807     9   7 2002       NA
## 2982     3   1 2003       NA
## 4623     8  28 2007       NA
## 5016     9  24 2008       NA
## 5213     4   9 2009       NA
# Only a handful of observations are incomplete, so drop them to obtain a
# data set without NA values.

clean_cin <- stats::na.omit(cin_weather)

# Confirm that no missing values remain.
sum(is.na(clean_cin))
## [1] 0

As Month, Day and Year are not continuous variables we view them as categorical variables in the summary:

# Convert the three date components to factors in a single pass so that
# summary() reports them as categories (level counts), not as numbers.

clean_cin[c("Month", "Day", "Year")] <- lapply(
  clean_cin[c("Month", "Day", "Year")],
  factor
)

# View a summary of every variable.

summary(clean_cin)
##      Month           Day            Year         Avg_Temp    
##  5      : 682   2      : 262   1996   : 366   Min.   :-2.20  
##  7      : 682   3      : 262   2000   : 366   1st Qu.:40.20  
##  1      : 681   4      : 262   2004   : 366   Median :57.10  
##  3      : 681   5      : 262   2012   : 366   Mean   :54.73  
##  8      : 681   6      : 262   1995   : 365   3rd Qu.:70.70  
##  10     : 670   8      : 262   1997   : 365   Max.   :89.20  
##  (Other):3872   (Other):6377   (Other):5755

The non-continuous variables can be viewed better (with a complete count for every level, rather than the truncated summary above) as follows:

# Number of observations recorded for each month.
table(clean_cin[["Month"]])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 681 622 681 659 682 656 682 681 658 670 630 647
# Number of observations recorded for each day of the month.
table(clean_cin[["Day"]])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 261 262 262 262 262 262 261 262 261 261 262 262 262 262 262 262 262 261 
##  19  20  21  22  23  24  25  26  27  28  29  30  31 
## 261 260 260 261 261 259 260 261 261 260 245 238 151
# Number of observations recorded for each year.
table(clean_cin[["Year"]])
## 
## 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 
##  365  366  365  361  364  366  365  360  364  366  365  365  364  365  364 
## 2010 2011 2012 2013 2014 2015 2016 
##  365  365  366  365  365  365  293

Data Visualization

Now let us visualize the data in a graphical manner using ggplot

# Viz 1: minimum, maximum and median of the daily average temperature for
# each of the 12 months, with one panel per year from 1995 to 2016. As 2016
# was still in progress, its data only runs until October. Every year the
# temperatures peak in the June-July months (around 75 degrees) and are
# lowest in December-January (around 25 degrees), which is exactly what we
# expect.
#
# The summary functions are passed through `fun`, `fun.min` and `fun.max`.
# These replace `fun.y`, `fun.ymin` and `fun.ymax`, which are deprecated
# since ggplot2 3.3.0 and raise a deprecation warning there.

ggplot(data = clean_cin) +
  stat_summary(
    mapping = aes(x = Month, y = Avg_Temp),
    fun.min = min,
    fun.max = max,
    fun = median
  ) +
  facet_wrap(~ Year, nrow = 4)

# Viz 2: daily average temperature against the day of the month, with the
# points coloured by month. Within a given month the temperature stays more
# or less the same across all days. For example, January is low on every
# day and July is high throughout the month. This again is expected.

ggplot(clean_cin, aes(x = Day, y = Avg_Temp, color = Month)) +
  geom_point()

# Viz 3: bar chart of how many days recorded each average temperature, with
# every bar filled by year. The author's reading of the plot: the tallest
# bar is around 39 days at a temperature of around 73 degrees, attributed to
# 2012.
# NOTE(review): the bars look stacked (geom_bar default), so a bar's height
# would be the total over all years - confirm the attribution to 2012.

ggplot(clean_cin, aes(x = Avg_Temp, fill = Year)) +
  geom_bar()