(a).Go to the search engine for the Global Lake and River Ice Phenology Database: http://nsidc.org/data/lake_river_ice/freezethaw.html Type a name code in the appropriate box (in capitals). Suitable lakes for this study include JGL03 (Iowa); DMR1, JJM22, JJM27, and JJM33 (Wisconsin); KMS10, KMS11, and KMS14 (New York); GW240, GW341, GW369, GW512, and GW592 (Sweden); JK02,JK03, JK09, JK17, JK31, JK48 (Finland); and NG1 (Russia). In the output parameter options list, choose Ice Off Date, Ice Duration, Latitude, Longitude, Lake Name, and Country Name. In the output sort options list, choose Ice Season. Click the Submit Request button.
We have downloaded the NY file for this analysis.
(b).copy the data & keep only Year and Duration.Also remove rows with invalid data value of -999
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Pull the csv file into a data frame.
icedata <- read.table(file="liag_ice.csv", header=TRUE, sep=",")
#notice few observations
head(icedata)
## iceoff_year iceoff_month iceoff_day duration latitude longitude
## 1 1843 4 26 -999 42.75 -74.89
## 2 1844 4 13 -999 42.75 -74.89
## 3 1845 4 1 -999 42.75 -74.89
## 4 1846 4 7 -999 42.75 -74.89
## 5 1847 4 25 -999 42.75 -74.89
## 6 1848 4 10 -999 42.75 -74.89
## lakename country
## 1 OTSEGO UNITED STATES
## 2 OTSEGO UNITED STATES
## 3 OTSEGO UNITED STATES
## 4 OTSEGO UNITED STATES
## 5 OTSEGO UNITED STATES
## 6 OTSEGO UNITED STATES
#copy it to the dplyr data frame
icedata_df <- tbl_df(icedata)
#select only Year and Duration
icedata_df <- select(icedata_df, iceoff_year, duration)
#filter the invalid observations.
icedata_df <- filter(icedata_df, iceoff_year != -999 & duration != -999)
(c) Fit a linear model to the data. Do three different calculations: (1) the full data set, (2) the years 1930–1970, and (3) the years 1970 to the present.Of particular interest is the slope.
#Full data
plot(icedata_df$iceoff_year, icedata_df$duration, main="New York Lake and River Ice Phenology",sub="http://nsidc.org/data/lake river ice/freezethaw.html", xlab="Year", ylab="Duration")
fit1 <- lm(icedata_df$duration ~ icedata_df$iceoff_year)
#fit1 y = mx + b
#Slope
fit1$coefficients[2] #Slope is -0.1164184
## icedata_df$iceoff_year
## -0.1164184
#Correlation
cor(icedata_df$iceoff_year, icedata_df$duration) #Correlation co-efficient is -0.2481434
## [1] -0.2481434
abline(fit1) #This shows that line is trending slightly negative ( downwards)
#residual sum
sum((round(residuals(summary(fit1)),4))^2)
## [1] 64191.14
#Using ggplot draw the lm
#install.packages("ggplot2")
library(ggplot2)
ggplot(icedata_df, aes(x = iceoff_year, y = duration))+stat_summary(fun.data=mean_cl_normal)+geom_smooth(method="lm")
## Warning: Removed 155 rows containing missing values (geom_segment).
#-------------------------------------------------------------------------------------------------#
#Year 1930 to 1970
icedata_1930_70 <- filter(icedata_df, iceoff_year >= 1930 & iceoff_year <= 1970)
plot(icedata_1930_70$iceoff_year, icedata_1930_70$duration, main="New York Lake and River Ice Phenology(1930 to 1970)",sub="http://nsidc.org/data/lake river ice/freezethaw.html", xlab="Year", ylab="Duration")
fit2 <- lm(icedata_1930_70$duration ~ icedata_1930_70$iceoff_year)
#fit1 y = mx + b
#Slope
fit2$coefficients[2] #Slope is 0.5174216
## icedata_1930_70$iceoff_year
## 0.5174216
#Correlation
cor(icedata_1930_70$iceoff_year, icedata_1930_70$duration) #Correlation co-efficient is 0.2995863
## [1] 0.2995863
abline(fit2) #This shows that line is trending slightly positive ( upwards)
sum((round(residuals(summary(fit2)),4))^2)
## [1] 15585.36
#Using ggplot draw the lm
ggplot(icedata_1930_70, aes(x = iceoff_year, y = duration))+stat_summary(fun.data=mean_cl_normal)+geom_smooth(method="lm")
## Warning: Removed 41 rows containing missing values (geom_segment).
#-------------------------------------------------------------------------------------------------#
#Year 1970 to Now
icedata_1970_14 <- filter(icedata_df, iceoff_year >= 1970 & iceoff_year <= 2014)
plot(icedata_1970_14$iceoff_year, icedata_1970_14$duration, main="New York Lake and River Ice Phenology(1970 to Now)",sub="http://nsidc.org/data/lake river ice/freezethaw.html", xlab="Year", ylab="Duration")
fit3 <- lm(icedata_1970_14$duration ~ icedata_1970_14$iceoff_year)
#fit1 y = mx + b
#Slope
fit3$coefficients[2] #Slope is -0.4329081
## icedata_1970_14$iceoff_year
## -0.4329081
#Correlation
cor(icedata_1970_14$iceoff_year, icedata_1970_14$duration) #Correlation co-efficient is -0.2493461
## [1] -0.2493461
abline(fit3) #This shows that line is trending slightly negative ( downwards)
sum((round(residuals(summary(fit3)),4))^2)
## [1] 10371.14
#Using ggplot draw the lm
ggplot(icedata_1970_14, aes(x = iceoff_year, y = duration))+stat_summary(fun.data=mean_cl_normal)+geom_smooth(method="lm")
## Warning: Removed 35 rows containing missing values (geom_segment).
(d) Plot all of the data as points on a graph. Explain why it is a mistake to connect the points to make a dot-to-dot graph
plot(icedata_df$iceoff_year, icedata_df$duration, main="New York Lake and River Ice Phenology",sub="http://nsidc.org/data/lake river ice/freezethaw.html", xlab="Year", ylab="Duration")
The data points are spread(/sprinkled) across the periods and having a dot-to-dot graph will be tedious and will not help us deriving any meaningful statistical conclustions here.
(e) Add the three linear regression lines to the plot, being careful to use only the appropriate time interval for each.
#Add the three linear regression lines to the plot.
plot(icedata_df$iceoff_year, icedata_df$duration, main="New York Lake and River Ice Phenology",sub="http://nsidc.org/data/lake river ice/freezethaw.html", xlab="Year", ylab="Duration")
abline(fit1) #for whole data
abline(fit2, col="red") # from 1930 to 1970
abline(fit3, col="blue") # from 1970 to now.
#Lets try this in ggplot2
qplot(x=iceoff_year, y=duration, data=icedata_df) +
geom_abline(intercept=fit1$coefficients[[1]], slope=fit1$coefficients[[2]]) +
geom_abline(intercept=fit2$coefficients[[1]], slope=fit2$coefficients[[2]], colour="red") +
geom_abline(intercept=fit3$coefficients[[1]], slope=fit3$coefficients[[2]], colour="blue")
(f) What do the results appear to say about global climate change ?
From the above, If we consider just the fit1 ( whole period) Or, fit3 ( 1970 to now ) the trend appears to be negative ( duration decreases with increase in years), however, if we include the fit2 ( 1930 to 1970 period) the slope is positive (& correlation is 0.2995863), which indicates a slightly positive trend ( duration increased with in those years). So, overall, the global climate change appears to be trending downwards, meaning, the duration of ice cover is slightly decreasing over the years.