library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.3
Data from https://www.usclimatedata.com/ was entered into a CSV file called ClimateData.csv, using the wide format given.
The CSV file is read into R.
# Read the wide-format climate table from the raw GitHub URL, then show it.
dfWide <- read.csv(
  file = "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/ClimateData.csv"
)
print(dfWide)
## X Jan Feb Mar Apr May Jun Jul
## 1 Average high in ºF 42.00 44.00 53.00 64.00 75.00 83.00 87.00
## 2 Average low in ºF 27.00 28.00 35.00 44.00 54.00 63.00 68.00
## 3 Days with precipitation 11.00 8.00 12.00 10.00 12.00 9.00 10.00
## 4 Hours of sunshine 76.00 97.00 135.00 182.00 221.00 214.00 226.00
## 5 Av. precipitation in inch 3.03 2.48 3.23 3.15 4.13 3.23 4.13
## Aug Sep Oct Nov Dec
## 1 84.00 78.00 67.00 55.00 45.0
## 2 66.00 59.00 48.00 38.00 29.0
## 3 10.00 8.00 8.00 8.00 9.0
## 4 186.00 170.00 123.00 87.00 66.0
## 5 4.88 3.82 3.07 2.83 2.8
# Convert the data frame to a tibble for compact printing.
# as_tibble() is used because tbl_df() has been deprecated since dplyr 1.0.0.
tbWide <- as_tibble(dfWide)
tbWide
## # A tibble: 5 x 13
## X Jan Feb Mar Apr May Jun Jul Aug Sep Oct
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aver~ 42 44 53 64 75 83 87 84 78 67
## 2 Aver~ 27 28 35 44 54 63 68 66 59 48
## 3 Days~ 11 8 12 10 12 9 10 10 8 8
## 4 Hour~ 76 97 135 182 221 214 226 186 170 123
## 5 Av. ~ 3.03 2.48 3.23 3.15 4.13 3.23 4.13 4.88 3.82 3.07
## # ... with 2 more variables: Nov <dbl>, Dec <dbl>
We reshape the data from a wide table into a tidy table, with variables as columns and observations as rows, starting by renaming the X column.
# Give the label column (read in as `X`) the descriptive name `Climate`.
clean <- rename(dfWide, Climate = X)
print(clean)
## Climate Jan Feb Mar Apr May Jun Jul
## 1 Average high in ºF 42.00 44.00 53.00 64.00 75.00 83.00 87.00
## 2 Average low in ºF 27.00 28.00 35.00 44.00 54.00 63.00 68.00
## 3 Days with precipitation 11.00 8.00 12.00 10.00 12.00 9.00 10.00
## 4 Hours of sunshine 76.00 97.00 135.00 182.00 221.00 214.00 226.00
## 5 Av. precipitation in inch 3.03 2.48 3.23 3.15 4.13 3.23 4.13
## Aug Sep Oct Nov Dec
## 1 84.00 78.00 67.00 55.00 45.0
## 2 66.00 59.00 48.00 38.00 29.0
## 3 10.00 8.00 8.00 8.00 9.0
## 4 186.00 170.00 123.00 87.00 66.0
## 5 4.88 3.82 3.07 2.83 2.8
I transformed the data so that each month is a row and each of the 5 climate measures is a column, to better visualize the climate data.
# Reshape in two steps so that each month is one row and each climate
# measure is one column. pivot_longer()/pivot_wider() are used in place of
# the superseded gather()/spread().
# Step 1: stack the twelve month columns into Month/counts pairs.
cleaner <- clean %>%
  pivot_longer(cols = Jan:Dec, names_to = "Month", values_to = "counts")
# Step 2: spread the climate measures out into their own columns.
# Rows and columns come out in order of first appearance, not alphabetically.
tidy <- cleaner %>%
  pivot_wider(names_from = Climate, values_from = counts)
tidy
## Month Av. precipitation in inch Average high in ºF Average low in ºF
## 1 Apr 3.15 64 44
## 2 Aug 4.88 84 66
## 3 Dec 2.80 45 29
## 4 Feb 2.48 44 28
## 5 Jan 3.03 42 27
## 6 Jul 4.13 87 68
## 7 Jun 3.23 83 63
## 8 Mar 3.23 53 35
## 9 May 4.13 75 54
## 10 Nov 2.83 55 38
## 11 Oct 3.07 67 48
## 12 Sep 3.82 78 59
## Days with precipitation Hours of sunshine
## 1 10 182
## 2 10 186
## 3 9 66
## 4 8 97
## 5 11 76
## 6 10 226
## 7 9 214
## 8 12 135
## 9 12 221
## 10 8 87
## 11 8 123
## 12 8 170
# Turn Month into an ordered factor that follows the calendar (month.abb)
# rather than alphabetical order.
tidy[["Month"]] <- ordered(tidy[["Month"]], levels = month.abb)
# Display the rows in calendar order; `tidy` itself is not reassigned here.
tidy[order(tidy[["Month"]]), ]
## Month Av. precipitation in inch Average high in ºF Average low in ºF
## 5 Jan 3.03 42 27
## 4 Feb 2.48 44 28
## 8 Mar 3.23 53 35
## 1 Apr 3.15 64 44
## 9 May 4.13 75 54
## 7 Jun 3.23 83 63
## 6 Jul 4.13 87 68
## 2 Aug 4.88 84 66
## 12 Sep 3.82 78 59
## 11 Oct 3.07 67 48
## 10 Nov 2.83 55 38
## 3 Dec 2.80 45 29
## Days with precipitation Hours of sunshine
## 5 11 76
## 4 8 97
## 8 12 135
## 1 10 182
## 9 12 221
## 7 9 214
## 6 10 226
## 2 10 186
## 12 8 170
## 11 8 123
## 10 8 87
## 3 9 66
# Per-column overview of the tidy table.
tidy %>% summary()
## Month Av. precipitation in inch Average high in ºF Average low in ºF
## Jan :1 Min. :2.480 Min. :42.00 Min. :27.00
## Feb :1 1st Qu.:2.980 1st Qu.:51.00 1st Qu.:33.50
## Mar :1 Median :3.190 Median :65.50 Median :46.00
## Apr :1 Mean :3.398 Mean :64.75 Mean :46.58
## May :1 3rd Qu.:3.897 3rd Qu.:79.25 3rd Qu.:60.00
## Jun :1 Max. :4.880 Max. :87.00 Max. :68.00
## (Other):6
## Days with precipitation Hours of sunshine
## Min. : 8.000 Min. : 66.0
## 1st Qu.: 8.000 1st Qu.: 94.5
## Median : 9.500 Median :152.5
## Mean : 9.583 Mean :148.6
## 3rd Qu.:10.250 3rd Qu.:193.0
## Max. :12.000 Max. :226.0
##
From the summary, there appears to be a relationship between average precipitation and hours of sunshine. I pulled those two variables, together with the month, into their own table and used ggplot to check.
# Keep only the month, precipitation and sunshine columns, in calendar order.
# select() is used rather than group_by()/summarise() because nothing is
# aggregated and the result should not stay grouped by Month.
tidy2 <- tidy %>%
  arrange(Month) %>%
  select(Month, `Av. precipitation in inch`, `Hours of sunshine`) %>%
  as_tibble()
tidy2
## # A tibble: 12 x 3
## # Groups: Month [12]
## Month `Av. precipitation in inch` `Hours of sunshine`
## <ord> <dbl> <dbl>
## 1 Jan 3.03 76
## 2 Feb 2.48 97
## 3 Mar 3.23 135
## 4 Apr 3.15 182
## 5 May 4.13 221
## 6 Jun 3.23 214
## 7 Jul 4.13 226
## 8 Aug 4.88 186
## 9 Sep 3.82 170
## 10 Oct 3.07 123
## 11 Nov 2.83 87
## 12 Dec 2.8 66
# Scatter plot of hours of sunshine against average precipitation,
# one point per month, coloured by month.
g <- ggplot(
  tidy2,
  aes(
    x = `Av. precipitation in inch`,
    y = `Hours of sunshine`,
    group = Month,
    color = Month
  )
) +
  geom_point(size = 8) +
  ggtitle("Precipitation Due to Sunshine") +
  ylab("Hours of Sunshine") +
  xlab("Av. precipitation in inch") +
  # Start from the currently active theme, then centre the title and
  # enlarge the text.
  theme_get() +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 13)
  )
# Render the plot once.
g