GEOG 6000
Date: 10/21/2015
Sulochan Dhungel
setwd("C:/Users/Sulochan/Copy/Fall 2015/Advance Geo/Assignment 5")
GlbTemp = read.csv("GlobalTemp.csv")
head(GlbTemp)
## Year NHTemp SHTemp
## 1 1000 -0.27668 -0.45604
## 2 1001 -0.23782 -0.40895
## 3 1002 -0.20315 -0.37730
## 4 1003 -0.17683 -0.35891
## 5 1004 -0.16292 -0.34768
## 6 1005 -0.16467 -0.33543
# NHTemp & SHTemp => annual temperature for the Northern and Southern Hemispheres
summary(GlbTemp)
## Year NHTemp SHTemp
## Min. :1000 Min. :-1.0527 Min. :-0.9452
## 1st Qu.:1249 1st Qu.:-0.4997 1st Qu.:-0.4482
## Median :1498 Median :-0.3815 Median :-0.3234
## Mean :1498 Mean :-0.3804 Mean :-0.3175
## 3rd Qu.:1746 3rd Qu.:-0.2734 3rd Qu.:-0.1936
## Max. :1995 Max. : 0.2534 Max. : 0.1904
#boxplot(GlbTemp$NHTemp, GlbTemp$SHTemp)
plot(1:dim(GlbTemp)[1], GlbTemp$NHTemp, "l", lwd = 2, col = "red", xlab = "Years", ylab = "Temp relative to Avg 1961-1990")
grid()
abline(h=mean(GlbTemp$NHTemp,na.rm=T), col = "red", lty=2)
lines(1:dim(GlbTemp)[1], GlbTemp$SHTemp, "l", lwd = 2, col = "blue")
abline(h=mean(GlbTemp$SHTemp,na.rm=T), col = "blue", lty=2)
legend("topleft",legend=c("N Hem","S Hem"), lty=1,lwd =2, col=c("red","blue"), bty="n")
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=0.6)
summary(NHTemp.loess1)
## Call:
## loess(formula = NHTemp ~ Year, data = GlbTemp, span = 0.6)
##
## Number of Observations: 996
## Equivalent Number of Parameters: 5.28
## Residual Standard Error: 0.1585
## Trace of smoother matrix: 5.77
##
## Control settings:
## normalize: TRUE
## span : 0.6
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=0.6)
summary(SHTemp.loess1)
## Call:
## loess(formula = SHTemp ~ Year, data = GlbTemp, span = 0.6)
##
## Number of Observations: 996
## Equivalent Number of Parameters: 5.28
## Residual Standard Error: 0.1779
## Trace of smoother matrix: 5.77
##
## Control settings:
## normalize: TRUE
## span : 0.6
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
Std_error_NHtemp = summary(NHTemp.loess1)$s
Std_error_SHtemp = summary(SHTemp.loess1)$s
Residual Error for North Hemisphere Temp Loess model = 0.1584804
Residual Error for South Hemisphere Temp Loess model = 0.1779037
newyrs <- data.frame(Year=seq(1000,2000,by=1))
newNHTemp <- predict(NHTemp.loess1, newdata=newyrs)
plot(GlbTemp$Year, GlbTemp$NHTemp,
col='red', pch=1,"p",cex=0.35,
xlab="Year",
ylab = "Temp relative to Avg 1961-1990",
main ="LOESS models (window = 60%)")
lines(newyrs$Year, newNHTemp,
col="red", lwd=2,"l",pch=0.2)
newSHTemp <- predict(SHTemp.loess1, newdata=newyrs)
lines(GlbTemp$Year, GlbTemp$SHTemp, col='blue', pch=1,"p",cex=0.35)
lines(newyrs$Year, newSHTemp, col="blue", lwd=2,"l",pch=0.2)
legend("topleft",legend=c("N Hem","S Hem"),
lty=1,lwd =2, col=c("red","blue"), bty="n")
On the basis of the figures, I think the temperatures trends were not same for both hemispheres. For example, from 1000 to around 1300, temperature was increasing in Southern hemisphere while it was decreasing in Northern Hemisphere. Also during the last 200 years, temperatures in both hemisphere have increased.
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.05)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.05)
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.15)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.15)
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=.25)
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=.25)
Based on the three different windows tried (5%, 15% and 25%), emphasis on higher frequency variability decreased withe increase in span (window size), due to the nature of LOESS regression. But at 5%, the high frequency variability was captured but smoothing was lost. At 25%, the smoothing was high so high frequency variabilty was not captured well. So, I think, the best window for capturing high frequency variability for this given dataset would be at 15%.
Extra: RELATION BETWEEN SPAN AND STANDARD ERROR
Std_error_NHtemp = c()
Std_error_SHtemp = c()
span_ctr = c()
for (i in 1:100){
span_ctr = c(span_ctr, 0.01*i)
NHTemp.loess1 <- loess(NHTemp ~ Year, data=GlbTemp, span=span_ctr[i])
SHTemp.loess1 <- loess(SHTemp ~ Year, data=GlbTemp, span=span_ctr[i])
Std_error_NHtemp = c(Std_error_NHtemp, summary(NHTemp.loess1)$s)
Std_error_SHtemp = c(Std_error_SHtemp, summary(SHTemp.loess1)$s)
}
plot(span_ctr, Std_error_SHtemp,col="blue","l",
xlab = "span", ylab ="Standard Error",
main = "Variation of std error with span")
lines(span_ctr, Std_error_NHtemp,col="red","l")
legend("topleft",legend=c("N Hem","S Hem"),
lty=1,lwd =2, col=c("red","blue"), bty="n")
grid()
This portion of code was writtend to check if standard error could give an idea on choosing span. The figure shows that at about span size = 0.25, the standard error almost levels out and does not increase with increasing span size. So, if the span size can be reduced below 0.25 for the specific case, without hurting smoothing too much, it would help improve the loess model.
pollen = read.csv("WNApollen.csv")
#head(pollen)
dim(pollen)
## [1] 2013 95
#install.packages("mgcv",repos="http://cran.us.r-project.org")
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-7. For overview type 'help("mgcv-package")'.
#colnames(pollen)
pairs(~ARTEMISIA+gdd5+chill+mtco+mipt,data=pollen,cex=0.5)
pollen.gam = gam(ARTEMISIA~s(gdd5)+s(chill)+s(mtco)+s(mipt), data=pollen)
pollen.gam
##
## Family: gaussian
## Link function: identity
##
## Formula:
## ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
##
## Estimated degrees of freedom:
## 7.47 8.49 7.43 8.67 total = 33.05
##
## GCV score: 0.01747729
Null Hypothesis: No relationship between response and predictor
Alternate Hypothesis: Some relationship exists between response and predictor
summary(pollen.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.091524 0.002922 31.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(gdd5) 7.467 8.378 10.335 9.46e-15 ***
## s(chill) 8.493 8.906 10.453 1.17e-15 ***
## s(mtco) 7.428 8.357 5.287 1.53e-06 ***
## s(mipt) 8.665 8.964 15.528 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.292 Deviance explained = 30.3%
## GCV = 0.017477 Scale est. = 0.01719 n = 2013
The results indicate that allfour terms are significant, which indicates all of the variables have a relationship with response- abudance of Artemisia.
Overall R - squared = 0.292
Percentage of deviance explained = 30.3%
pollen.gam2 = gam(ARTEMISIA~s(mtco)+s(mipt), data=pollen)
summary(pollen.gam2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## ARTEMISIA ~ s(mtco) + s(mipt)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.091524 0.003046 30.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(mtco) 8.679 8.968 37.18 <2e-16 ***
## s(mipt) 8.186 8.822 32.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.23 Deviance explained = 23.7%
## GCV = 0.01885 Scale est. = 0.018682 n = 2013
New R-squared = 0.23
Percentage of deviance explained = 23.7%
anova(pollen.gam2, pollen.gam)
## Analysis of Deviance Table
##
## Model 1: ARTEMISIA ~ s(mtco) + s(mipt)
## Model 2: ARTEMISIA ~ s(gdd5) + s(chill) + s(mtco) + s(mipt)
## Resid. Df Resid. Dev Df Deviance
## 1 1995.1 37.274
## 2 1979.9 34.036 15.188 3.2379
Since the decrease in deviance between the model is only 3.24, I do not consider this model to be worse than the first one (with 4 variables).
vis.gam(pollen.gam2, theta=130, phi=40, ticktype="detailed")
# mtco = Mean winter temperature
# mipt = Priestly-Taylor Moisture Index (0-dry, 1-wet)
From the figure, the optimal climate lies when mean winter temperature (mtco) is about -10 degrees and moisture index(mipt) is close to 0.3.
newclim <- data.frame(mtco=-10, mipt=0.35)
predict(pollen.gam2, newdata=newclim, se.fit=TRUE)
## $fit
## 1
## 0.2324331
##
## $se.fit
## 1
## 0.01276561
The predicted value of Artemisia at -10 deg C and moisture index of 0.35, is 0.2324.
birthwt = read.csv("lowbirthwgt.csv")
head(birthwt)
## low age lwt race smoke ptl ht ui ftv bwt
## 1 0 19 182 2 0 0 0 1 0 2523
## 2 0 33 155 3 0 0 0 0 3 2551
## 3 0 20 105 1 1 0 0 0 1 2557
## 4 0 21 108 1 1 0 0 1 2 2594
## 5 0 18 107 1 1 0 0 1 0 2600
## 6 0 21 124 3 0 0 0 0 0 2622
#install.packages("randomForest",repos="http://cran.us.r-project.org")
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
birthwt$low <- factor(birthwt$low, labels=c("normal","low"))
birthwt$race <- factor(birthwt$race, labels=c("white","black","other"))
birthwt$smoke <- factor(birthwt$smoke, labels=c("notsmoker","smoker"))
birthwt$ptl <- factor(birthwt$ptl, ordered=TRUE)
birthwt$ht <- factor(birthwt$ht, labels=c("noht","ht"))
birthwt$ui <- factor(birthwt$smoke, labels=c("noui","ui"))
birthwt$ftv <- factor(birthwt$ftv, ordered=TRUE)
birthwt2 <- birthwt[,-10]
head(birthwt2)
## low age lwt race smoke ptl ht ui ftv
## 1 normal 19 182 black notsmoker 0 noht noui 0
## 2 normal 33 155 other notsmoker 0 noht noui 3
## 3 normal 20 105 white smoker 0 noht ui 1
## 4 normal 21 108 white smoker 0 noht ui 2
## 5 normal 18 107 white smoker 0 noht ui 0
## 6 normal 21 124 other notsmoker 0 noht noui 0
birthwt2.rf1 = randomForest(low ~ .,dat=birthwt2,ntree=500, mtry=2)
birthwt2.rf1
##
## Call:
## randomForest(formula = low ~ ., data = birthwt2, ntree = 500, mtry = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 32.28%
## Confusion matrix:
## normal low class.error
## normal 114 16 0.1230769
## low 45 14 0.7627119
birthwt2.rf1$confusion
## normal low class.error
## normal 114 16 0.1230769
## low 45 14 0.7627119
The confusion matrix shows that the error for normal babies is lot less than the low weight babies.
plot(birthwt2.rf1, main = "Model misclassification error")
The plot shows that errors have stabilized. This type of error rates is available only for classification in randomforest.
varImpPlot(birthwt2.rf1, main = "Variable importance plot ")
The most important factors controlling low birth outcomes are lwt (MOther’s weight) and age (Age of mother). There is a sudden drop in mean decrease in Gini Index after these two variables.
## Data for prediction
newdata <- read.csv("lowbirthwgtpred.csv")
newdata$low <- factor(newdata$low, levels=c("normal","low"),
labels=c("normal","low"))
newdata$race <- factor(newdata$race, levels=c("white","black","other"),
labels=c("white","black","other"))
newdata$smoke <- factor(newdata$smoke, levels=c("notsmoker","smoker"),
labels=c("notsmoker","smoker"))
newdata$ptl <- factor(newdata$ptl, ordered=TRUE)
newdata$ht <- factor(newdata$ht, levels=c("noht","ht"),
labels=c("noht","ht"))
newdata$ui <- factor(newdata$ui, levels=c("noui","ui"),
labels=c("noui","ui"))
newdata$ftv <- factor(newdata$ftv, ordered=TRUE)
newdata
## low age lwt race smoke ptl ht ui ftv
## 1 <NA> 21 100 white smoker 1 ht noui 3
predict(birthwt2.rf1, newdata)
## 1
## normal
## Levels: normal low
For the given data, random forest model predicts that the birth weight will be normal. To get the probability, we use results from all trees.
newdata.pred = predict(birthwt2.rf1, newdata, predict.all=TRUE)
table(newdata.pred$individual)
##
## low normal
## 170 330
The probability of low birth for this person = 174/500 = 34.8%