Loading Libraries

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.3

CSV Data

Data from https://www.usclimatedata.com/ was entered into a CSV file called ClimateData.csv, keeping the wide format in which it was given.

Read CSV into R, tidy and transform data

CSV read into R

# Load the wide-format climate table from the raw GitHub URL.
dfWide <- read.csv(
  file = "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/ClimateData.csv"
)
# Show the data frame exactly as imported.
print(dfWide)
##                           X   Jan   Feb    Mar    Apr    May    Jun    Jul
## 1        Average high in ºF 42.00 44.00  53.00  64.00  75.00  83.00  87.00
## 2         Average low in ºF 27.00 28.00  35.00  44.00  54.00  63.00  68.00
## 3   Days with precipitation 11.00  8.00  12.00  10.00  12.00   9.00  10.00
## 4         Hours of sunshine 76.00 97.00 135.00 182.00 221.00 214.00 226.00
## 5 Av. precipitation in inch  3.03  2.48   3.23   3.15   4.13   3.23   4.13
##      Aug    Sep    Oct   Nov  Dec
## 1  84.00  78.00  67.00 55.00 45.0
## 2  66.00  59.00  48.00 38.00 29.0
## 3  10.00   8.00   8.00  8.00  9.0
## 4 186.00 170.00 123.00 87.00 66.0
## 5   4.88   3.82   3.07  2.83  2.8
# Convert the imported data frame to a tibble for compact printing.
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which has been
# deprecated since dplyr 1.0.0.
tbWide <- as_tibble(dfWide)
tbWide
## # A tibble: 5 x 13
##   X       Jan   Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
##   <fct> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 Aver~ 42    44     53     64     75     83     87     84     78     67   
## 2 Aver~ 27    28     35     44     54     63     68     66     59     48   
## 3 Days~ 11     8     12     10     12      9     10     10      8      8   
## 4 Hour~ 76    97    135    182    221    214    226    186    170    123   
## 5 Av. ~  3.03  2.48   3.23   3.15   4.13   3.23   4.13   4.88   3.82   3.07
## # ... with 2 more variables: Nov <dbl>, Dec <dbl>

Data Transformation

We reshape the data from a wide table to a tidy table, with variables as columns and observations as rows, and rename the `X` column to `Climate`.

# Give the unnamed first column (read in as `X`) a descriptive name.
clean <- rename(dfWide, Climate = X)

# Show the renamed data frame.
print(clean)
##                     Climate   Jan   Feb    Mar    Apr    May    Jun    Jul
## 1        Average high in ºF 42.00 44.00  53.00  64.00  75.00  83.00  87.00
## 2         Average low in ºF 27.00 28.00  35.00  44.00  54.00  63.00  68.00
## 3   Days with precipitation 11.00  8.00  12.00  10.00  12.00   9.00  10.00
## 4         Hours of sunshine 76.00 97.00 135.00 182.00 221.00 214.00 226.00
## 5 Av. precipitation in inch  3.03  2.48   3.23   3.15   4.13   3.23   4.13
##      Aug    Sep    Oct   Nov  Dec
## 1  84.00  78.00  67.00 55.00 45.0
## 2  66.00  59.00  48.00 38.00 29.0
## 3  10.00   8.00   8.00  8.00  9.0
## 4 186.00 170.00 123.00 87.00 66.0
## 5   4.88   3.82   3.07  2.83  2.8

I transformed the data so that each month is a row and each of the 5 climate measures is a column, to better visualize the climate data.

# Step 1: stack the twelve month columns into Month/counts pairs (long form).
cleaner <- gather(clean, key = "Month", value = "counts", Jan:Dec)
# Step 2: spread each climate measure into its own column, one row per month.
tidy <- spread(cleaner, key = Climate, value = counts)
print(tidy)
##    Month Av. precipitation in inch Average high in ºF Average low in ºF
## 1    Apr                      3.15                 64                44
## 2    Aug                      4.88                 84                66
## 3    Dec                      2.80                 45                29
## 4    Feb                      2.48                 44                28
## 5    Jan                      3.03                 42                27
## 6    Jul                      4.13                 87                68
## 7    Jun                      3.23                 83                63
## 8    Mar                      3.23                 53                35
## 9    May                      4.13                 75                54
## 10   Nov                      2.83                 55                38
## 11   Oct                      3.07                 67                48
## 12   Sep                      3.82                 78                59
##    Days with precipitation Hours of sunshine
## 1                       10               182
## 2                       10               186
## 3                        9                66
## 4                        8                97
## 5                       11                76
## 6                       10               226
## 7                        9               214
## 8                       12               135
## 9                       12               221
## 10                       8                87
## 11                       8               123
## 12                       8               170
# Turn Month into an ordered factor following calendar order (month.abb),
# so sorting and grouping no longer use alphabetical order.
tidy[["Month"]] <- ordered(tidy[["Month"]], levels = month.abb)
# Display the rows in calendar order; the sorted copy is not stored.
tidy[order(tidy[["Month"]]), ]
##    Month Av. precipitation in inch Average high in ºF Average low in ºF
## 5    Jan                      3.03                 42                27
## 4    Feb                      2.48                 44                28
## 8    Mar                      3.23                 53                35
## 1    Apr                      3.15                 64                44
## 9    May                      4.13                 75                54
## 7    Jun                      3.23                 83                63
## 6    Jul                      4.13                 87                68
## 2    Aug                      4.88                 84                66
## 12   Sep                      3.82                 78                59
## 11   Oct                      3.07                 67                48
## 10   Nov                      2.83                 55                38
## 3    Dec                      2.80                 45                29
##    Days with precipitation Hours of sunshine
## 5                       11                76
## 4                        8                97
## 8                       12               135
## 1                       10               182
## 9                       12               221
## 7                        9               214
## 6                       10               226
## 2                       10               186
## 12                       8               170
## 11                       8               123
## 10                       8                87
## 3                        9                66
# Per-column summary: level counts for Month, quartiles and mean for measures.
tidy %>% summary()
##      Month   Av. precipitation in inch Average high in ºF Average low in ºF
##  Jan    :1   Min.   :2.480             Min.   :42.00      Min.   :27.00    
##  Feb    :1   1st Qu.:2.980             1st Qu.:51.00      1st Qu.:33.50    
##  Mar    :1   Median :3.190             Median :65.50      Median :46.00    
##  Apr    :1   Mean   :3.398             Mean   :64.75      Mean   :46.58    
##  May    :1   3rd Qu.:3.897             3rd Qu.:79.25      3rd Qu.:60.00    
##  Jun    :1   Max.   :4.880             Max.   :87.00      Max.   :68.00    
##  (Other):6                                                                 
##  Days with precipitation Hours of sunshine
##  Min.   : 8.000          Min.   : 66.0    
##  1st Qu.: 8.000          1st Qu.: 94.5    
##  Median : 9.500          Median :152.5    
##  Mean   : 9.583          Mean   :148.6    
##  3rd Qu.:10.250          3rd Qu.:193.0    
##  Max.   :12.000          Max.   :226.0    
## 

Data Analysis and Visualization

From the summary, there appears to be a relationship between average precipitation and hours of sunshine. I grouped those variables and used ggplot to confirm.

# Build the plotting table: month, average precipitation and sunshine hours.
# The original called summarise() with a bare column and no aggregate, which
# only worked because every (Month, precipitation) group holds a single row.
# select() states the intent directly, and arrange() reproduces the calendar
# ordering that the grouping used to provide (Month is an ordered factor).
tidy2 <- tidy %>%
  select(Month, `Av. precipitation in inch`, `Hours of sunshine`) %>%
  arrange(Month) %>%
  # Keep the result grouped by Month so tidy2 stays the same grouped tibble.
  group_by(Month)

tidy2
## # A tibble: 12 x 3
## # Groups:   Month [12]
##    Month `Av. precipitation in inch` `Hours of sunshine`
##    <ord>                       <dbl>               <dbl>
##  1 Jan                          3.03                  76
##  2 Feb                          2.48                  97
##  3 Mar                          3.23                 135
##  4 Apr                          3.15                 182
##  5 May                          4.13                 221
##  6 Jun                          3.23                 214
##  7 Jul                          4.13                 226
##  8 Aug                          4.88                 186
##  9 Sep                          3.82                 170
## 10 Oct                          3.07                 123
## 11 Nov                          2.83                  87
## 12 Dec                          2.8                   66
# Scatter plot of monthly sunshine hours against average precipitation,
# one point per month, colored by month. Built as a single chain with `<-`
# instead of repeatedly reassigning `g` with `=`.
g <- ggplot(
  tidy2,
  aes(
    x = `Av. precipitation in inch`,
    y = `Hours of sunshine`,
    group = Month,
    color = Month
  )
) +
  # stat = "identity" is already the geom_point() default, so it is omitted.
  geom_point(size = 8) +
  labs(
    title = "Precipitation Due to Sunshine",
    x = "Av. precipitation in inch",
    y = "Hours of Sunshine"
  ) +
  # Apply the currently active ggplot2 theme, then adjust it below.
  theme_get() +
  # Center the title and set the base text size.
  theme(plot.title = element_text(hjust = 0.5), text = element_text(size = 13))

# Render the plot once (the original printed the same object twice).
g